diff --git a/src/main/cuda/headers/Matrix.h b/src/main/cuda/headers/Matrix.h
index 0446983d6bd..61ef939b837 100644
--- a/src/main/cuda/headers/Matrix.h
+++ b/src/main/cuda/headers/Matrix.h
@@ -26,19 +26,19 @@ using int32_t = int;
 
 template <typename T>
 struct Matrix {
-	int32_t nnz;
+	uint64_t nnz;
 	uint32_t rows;
 	uint32_t cols;
-	
 	uint32_t* row_ptr;
 	uint32_t* col_idx;
 	T* data;
 	
 	typedef T value_type;
 	
-	explicit Matrix(size_t* jvals) : nnz(jvals[0]), rows(jvals[1]), cols(jvals[2]),
-			row_ptr(reinterpret_cast<uint32_t*>(jvals[3])),
-			col_idx(reinterpret_cast<uint32_t*>((jvals[4]))), data(reinterpret_cast<T*>(jvals[5])) {}
+	explicit Matrix(uint8_t* jvals) : nnz(*reinterpret_cast<uint32_t*>(&jvals[0])),
+		rows(*reinterpret_cast<uint32_t*>(&jvals[8])), cols(*reinterpret_cast<uint32_t*>(&jvals[12])),
+			row_ptr(reinterpret_cast<uint32_t*>(jvals[16])), col_idx(reinterpret_cast<uint32_t*>((jvals[24]))),
+				data(static_cast<T*>(jvals[32])) {}
 };
 
 #ifdef __CUDACC__
@@ -72,7 +72,7 @@ class MatrixAccessor {
 	
 	__device__ void init(Matrix<T>* mat) { _mat = mat; }
 	
-	__device__ uint32_t& nnz() { return _mat->nnz; }
+	__device__ uint32_t& nnz() { return return _mat->row_ptr == nullptr ? _mat->rows * _mat->cols : _mat->nnz; }
 	__device__ uint32_t cols() { return _mat->cols; }
 	__device__ uint32_t rows() { return _mat->rows; }
 	
@@ -133,7 +133,7 @@ class MatrixAccessor {
 	
 	//ToDo sparse accessors
 	__device__ uint32_t len_sparse() {
-		return _mat->nnz;
+		return _mat->row_ptr[_mat->rows];
 	}
 	
 	__device__ uint32_t pos_sparse(uint32_t rix) {
@@ -227,34 +227,6 @@ class RingBuffer {
 	}
 };
 
-template <typename T, int NUM_B>
-struct SpoofOp {
-	MatrixAccessor<T> a;
-	MatrixAccessor<T> b[NUM_B];
-	MatrixAccessor<T> c;
-	T* scalars;
-	uint32_t grix;
-	T* avals;
-	uint32_t* aix;
-	uint32_t alen;
-	
-	SpoofOp(Matrix<T>* A, Matrix<T>* B, Matrix<T>* C, T* scalars, T* tmp_stor, uint32_t grix) :
-			scalars(scalars), grix(grix), avals(A->data), aix(A->col_idx) {
-		a.init(A);
-		c.init(C);
-		alen = a.row_len(grix);
-
-		if(B)
-			for(auto i = 0; i < NUM_B; ++i)
-				b[i].init(&(B[i]));
-	}
-	
-//	__device__ Vector<T>& getTempStorage(uint32_t len) {
-//		Vector<T>& vec = temp_rb.next();
-//		tvec.length = len;
-//		return vec;
-//	}
-};
 #endif // __CUDACC_RTC__
 
 #endif //SYSTEMDS_MATRIX_H
diff --git a/src/main/cuda/headers/reduction.cuh b/src/main/cuda/headers/reduction.cuh
index 3cd0a0b3946..7707fe2a40c 100644
--- a/src/main/cuda/headers/reduction.cuh
+++ b/src/main/cuda/headers/reduction.cuh
@@ -22,6 +22,7 @@
 #define REDUCTION_CUH
 
 using uint = unsigned int;
+
 #include <cuda_runtime.h>
 
 #include "utils.cuh"
@@ -51,7 +52,9 @@ using uint = unsigned int;
  * @param SpoofCellwiseOp		initial value for the reduction variable
  */
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
-__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op, SpoofCellwiseOp spoof_op) {
+__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op,
+	SpoofCellwiseOp spoof_op)
+{
 	auto sdata = shared_memory_proxy<T>();
 
 	// perform first level of reduction,
@@ -66,12 +69,9 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
 	// number of active thread blocks (via gridDim).  More blocks will result
 	// in a larger gridSize and therefore fewer elements per thread
 	while (i < N) {
-//		printf("tid=%d i=%d N=%d, in->cols()=%d rix=%d\n", threadIdx.x, i, N, in->cols(), i/in->cols());
 		v = reduction_op(v, spoof_op(*(in->vals(i)), i, i / in->cols(), i % in->cols()));
 
 		if (i + blockDim.x < N)	{
-			//__syncthreads();
-			//printf("loop fetch i(%d)+blockDim.x(%d)=%d, in=%f\n",i, blockDim.x, i + blockDim.x, g_idata[i + blockDim.x]);
 			v = reduction_op(v, spoof_op(*(in->vals(i+blockDim.x)), blockDim.x + i, (i+blockDim.x) / in->cols(), (i+blockDim.x) % in->cols()));
 		}
 
@@ -116,40 +116,25 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
 		if (blockDim.x >= 64) {
 			smem[tid] = v = reduction_op(v, smem[tid + 32]);
 		}
-//		if(tid<12)
-//			printf("bid=%d tid=%d reduction result: %3.1f\n", blockIdx.x, tid, sdata[tid]);
-		
 		if (blockDim.x >= 32) {
 			smem[tid] = v = reduction_op(v, smem[tid + 16]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 16) {
 			smem[tid] = v = reduction_op(v, smem[tid + 8]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 8) {
 			smem[tid] = v = reduction_op(v, smem[tid + 4]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 4) {
 			smem[tid] = v = reduction_op(v, smem[tid + 2]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 		if (blockDim.x >= 2) {
 			smem[tid] = v = reduction_op(v, smem[tid + 1]);
 		}
-//		if(tid==0)
-//			printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 	}
 
 	 // write result for this block to global mem
 	 if (tid == 0) {
-//	 	if(gridDim.x < 10)
-//	 		printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
 	 	out->val(0, blockIdx.x) = sdata[0];
 	 }
 }
@@ -174,19 +159,10 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
  * the value before writing it to its final location in global memory for each
  * row
  */
-//template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
-//__device__ void ROW_AGG(
-//		T *g_idata, ///< input data stored in device memory (of size rows*cols)
-//		T *g_odata,  ///< output/temporary array store in device memory (of size
-//		/// rows*cols)
-//		uint rows,  ///< rows in input and temporary/output arrays
-//		uint cols,  ///< columns in input and temporary/output arrays
-//		T initialValue,  ///< initial value for the reduction variable
-//		ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
-//		SpoofCellwiseOp spoof_op) ///< Operation to perform before assigning this
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
 __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT,  ReductionOp reduction_op,
-					   SpoofCellwiseOp spoof_op) {
+	SpoofCellwiseOp spoof_op)
+{
 	auto sdata = shared_memory_proxy<T>();
 
 	// one block per row
@@ -199,7 +175,6 @@ __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
 	uint32_t i = tid;
 	uint block_offset = block * in->cols();
 
-//	T v = initialValue;
 	T v = reduction_op.init();
 	while (i < in->cols()) {
 		v = reduction_op(v, spoof_op(in->val(block_offset + i), i, i / in->cols(), i % in->cols()));
@@ -283,16 +258,8 @@ __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
  */
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
 __device__ void COL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT,  ReductionOp reduction_op,
-						SpoofCellwiseOp spoof_op) {
-//__device__ void COL_AGG(T *g_idata, ///< input data stored in device memory (of size rows*cols)
-//		T *g_odata,  ///< output/temporary array store in device memory (of size rows*cols)
-//		uint rows,  ///< rows in input and temporary/output arrays
-//		uint cols,  ///< columns in input and temporary/output arrays
-//		T initialValue,  ///< initial value for the reduction variable
-//		ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
-//		SpoofCellwiseOp spoof_op) ///< Operation to perform before aggregation
-//
-//{
+	SpoofCellwiseOp spoof_op)
+{
 	uint global_tid = blockIdx.x * blockDim.x + threadIdx.x;
 	if (global_tid >= in->cols()) {
 		return;
@@ -315,13 +282,12 @@ __device__ void NO_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N
 	uint32_t gtid = blockIdx.x * blockDim.x + threadIdx.x;
 	uint32_t first_idx = gtid * static_cast<uint32_t>(VT);
 	uint32_t last_idx = min(first_idx + static_cast<uint32_t>(VT), N);
+
 	#pragma unroll
 	for(auto i = first_idx; i < last_idx; i++) {
 		T a = in->hasData() ? in->vals(0)[i] : 0;
 		T result = spoof_op(a, i, i / in->cols(), i % in->cols());
 		out->vals(0)[i] = result;
-		//if(i < 4)
-		//	printf("tid=%d in=%4.3f res=%4.3f out=%4.3f r=%d\n", i, in->vals(0)[i], result, out->vals(0)[i], i/in->cols());
 	}
 }
 
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
index 233a4a179c6..2ef482d18dd 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -77,10 +77,15 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* resourc
 	CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_d"));
 	ctx->reduction_kernels_d.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG, SpoofOperator::AggOp::MAX), func));
 	
+	CHECK_CUDART(cudaMallocHost(reinterpret_cast<void**>(&(ctx->staging_buffer)), ctx->default_mem_size));
+	CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&(ctx->device_buffer)), ctx->default_mem_size));
+	ctx->current_mem_size = ctx->default_mem_size;
 	return reinterpret_cast<size_t>(ctx);
 }
 
 void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, [[maybe_unused]] uint32_t device_id) {
+	cudaFreeHost(ctx->staging_buffer);
+	cudaFree(ctx->device_buffer);
 	delete ctx;
 	// cuda device is handled by jCuda atm
 	//cudaDeviceReset();
@@ -116,15 +121,25 @@ size_t SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const std::s
 
 template<typename T>
 CUfunction SpoofCUDAContext::getReductionKernel(const std::pair<SpoofOperator::AggType, SpoofOperator::AggOp> &key) {
-	return nullptr;
+	return nullptr; // generic case never used
 }
+
 template<>
 CUfunction SpoofCUDAContext::getReductionKernel<float>(const std::pair<SpoofOperator::AggType,
 		SpoofOperator::AggOp> &key) {
 	return reduction_kernels_f[key];
 }
+
 template<>
 CUfunction SpoofCUDAContext::getReductionKernel<double>(const std::pair<SpoofOperator::AggType,
 		SpoofOperator::AggOp> &key) {
 	return reduction_kernels_d[key];
-}
\ No newline at end of file
+}
+
+void SpoofCUDAContext::resize_staging_buffer(size_t size) {
+	cudaFreeHost(staging_buffer);
+	cudaFree(device_buffer);
+	CHECK_CUDART(cudaMallocHost(reinterpret_cast<void**>(&(staging_buffer)), size));
+	CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&(device_buffer)), size));
+	current_mem_size = size;
+}
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
index 696682fc228..e4b80c5e401 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -46,62 +46,34 @@ class SpoofCUDAContext {
 	std::map<std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>, CUfunction> reduction_kernels_f;
 	std::map<std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>, CUfunction> reduction_kernels_d;
 
-//	double handling_total, compile_total;
-	
 	const std::string resource_path;
 	const std::vector<std::string> include_paths;
 	
 public:
+	size_t default_mem_size = 1024; // 1kb for hosting data pointers, scalars and some meta info. This default should
+									// not require resizing these buffers in most cases.
+	size_t current_mem_size = 0; // the actual staging buffer size (should be default unless there was a resize)
+	std::byte* staging_buffer{}; // pinned host mem for async transfers
+	std::byte* device_buffer{};  // this buffer holds the pointers to the data buffers
 
 	explicit SpoofCUDAContext(const char* resource_path_, std::vector<std::string>  include_paths_) : reductions(nullptr),
-			resource_path(resource_path_), include_paths(std::move(include_paths_))
-			//,handling_total(0.0), compile_total(0.0)
-			{}
+			resource_path(resource_path_), include_paths(std::move(include_paths_)) { }
 
 	static size_t initialize_cuda(uint32_t device_id, const char* resource_path_);
 
 	static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);
-	
+
 	size_t compile(std::unique_ptr<SpoofOperator> op, const std::string &src);
-	
+
 	template <typename T, typename CALL>
-	int launch(uint32_t opID, std::vector<Matrix<T>>& input, std::vector<Matrix<T>>& sides, Matrix<T>& output,
-			T* scalars, uint32_t grix) {
-		// dp holds in/side/out/scalar pointers for GPU
-		DevMatPtrs<T> dp;
-
-		SpoofOperator* op = compiled_ops[opID].get();
-		
-		CHECK_CUDART(cudaMalloc((void **)&dp.in, sizeof(Matrix<T>) * input.size()));
-		CHECK_CUDART(cudaMemcpy(dp.in, reinterpret_cast<void*>(&input[0]), sizeof(Matrix<T>) * input.size(),
-				cudaMemcpyHostToDevice));
-
-		if (!sides.empty()) {
-			CHECK_CUDART(cudaMalloc(reinterpret_cast<void **>(&dp.sides), sizeof(Matrix<T>) * sides.size()));
-			CHECK_CUDART(cudaMemcpy(dp.sides, &sides[0], sizeof(Matrix<T>)  * sides.size(), cudaMemcpyHostToDevice));
-		}
-		
-		if (op->isSparseSafe() && input.front().row_ptr != nullptr) {
-			CHECK_CUDART(cudaMemcpy(output.row_ptr, input.front().row_ptr, (input.front().rows+1)*sizeof(uint32_t),
-					cudaMemcpyDeviceToDevice));
-		}
-#ifndef NDEBUG
-		std::cout << "output rows: " << output.rows << " cols: " << output.cols << " nnz: " << output.nnz << " format: " <<
-				(output.row_ptr == nullptr ? "dense" : "sparse") << std::endl;
-#endif
-		size_t out_num_elements = output.rows * output.cols;
-		if(output.row_ptr)
-			if(op->isSparseSafe() && output.nnz > 0)
-				out_num_elements = output.nnz;
-		CHECK_CUDART(cudaMalloc((void **) &dp.out, sizeof(Matrix<T>)));
-		CHECK_CUDART(cudaMemset(output.data, 0, out_num_elements * sizeof(T)));
-		CHECK_CUDART(cudaMemcpy(dp.out, reinterpret_cast<void *>(&output), sizeof(Matrix<T>),
-				cudaMemcpyHostToDevice));
-		
-		dp.scalars = scalars;
-
-		CALL::exec(this, op, input, sides, output, grix, dp);
-		
+	int launch() {
+
+		DataBufferWrapper dbw(staging_buffer, device_buffer);
+		SpoofOperator* op = compiled_ops[dbw.op_id()].get();
+		dbw.toDevice(op->stream);
+
+		CALL::exec(this, op, &dbw);
+
 		return 0;
 	}
 	
@@ -109,6 +81,8 @@ class SpoofCUDAContext {
 
 	template<typename T>
 	CUfunction getReductionKernel(const std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>& key);
+
+	void resize_staging_buffer(size_t size);
 };
 
 #endif // SPOOFCUDACONTEXT_H
diff --git a/src/main/cuda/spoof-launcher/SpoofCellwise.h b/src/main/cuda/spoof-launcher/SpoofCellwise.h
index fe7e9d66688..9077840020d 100644
--- a/src/main/cuda/spoof-launcher/SpoofCellwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofCellwise.h
@@ -27,8 +27,7 @@
 template<typename T>
 struct SpoofCellwiseFullAgg {
 	
-	static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const std::string& op_name,
-			std::vector<Matrix<T>>& sides, uint32_t grix,  DevMatPtrs<T>& dp) {
+	static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw) {
 		T value_type;
 		
 		// num ctas
@@ -46,14 +45,15 @@ struct SpoofCellwiseFullAgg {
 						  << std::endl;
 #endif
 		CHECK_CUDA(op->program.get()->kernel(op_name)
-						   .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-						   .configure(grid, block, shared_mem_size)
-						   .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+						   .configure(grid, block, shared_mem_size, op->stream)
+						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 		
 		if(NB > 1) {
 			N = NB;
 			while (NB > 1) {
-				void* args[3] = { &dp.out, &dp.out, &N};
+				Matrix<T>* out = dbw->d_out<T>();
+				void* args[3] = { &out, &out, &N};
 				
 				NB = std::ceil((N + NT * 2 - 1) / (NT * 2));
 #ifndef NDEBUG
@@ -64,7 +64,7 @@ struct SpoofCellwiseFullAgg {
                     << N << " elements"
                     << std::endl;
 #endif
-				CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 1, NT, 1, 1, shared_mem_size, nullptr, args, nullptr));
+				CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 1, NT, 1, 1, shared_mem_size, op->stream, args, nullptr));
 				N = NB;
 			}
 		}
@@ -74,12 +74,11 @@ struct SpoofCellwiseFullAgg {
 
 template<typename T>
 struct SpoofCellwiseRowAgg {
-	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name,
-			  std::vector<Matrix<T>> &input, std::vector<Matrix<T>> &sides, uint32_t grix, DevMatPtrs<T>& dp) {
+	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw) {
 		T value_type;
 		
 		// num ctas
-		uint32_t NB = input.front().rows;
+		uint32_t NB = dbw->h_in<T>(0)->rows;
 		dim3 grid(NB, 1, 1);
 		dim3 block(NT, 1, 1);
 		uint32_t shared_mem_size = NT * sizeof(T);
@@ -90,9 +89,9 @@ struct SpoofCellwiseRowAgg {
 					<< N << " elements" << std::endl;
 #endif
 		CHECK_CUDA(op->program->kernel(op_name)
-						   .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-						   .configure(grid, block, shared_mem_size)
-						   .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+						   .configure(grid, block, shared_mem_size, op->stream)
+						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 		
 	}
 };
@@ -100,8 +99,7 @@ struct SpoofCellwiseRowAgg {
 
 template<typename T>
 struct SpoofCellwiseColAgg {
-	static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const std::string& op_name,
-					 std::vector<Matrix<T>>& sides, uint32_t grix, DevMatPtrs<T>& dp) {
+	static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw) {
 		T value_type;
 		
 		// num ctas
@@ -116,9 +114,9 @@ struct SpoofCellwiseColAgg {
 						<< N << " elements" << std::endl;
 #endif
 		CHECK_CUDA(op->program->kernel(op_name)
-						   .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-						   .configure(grid, block, shared_mem_size)
-						   .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+						   .configure(grid, block, shared_mem_size, op->stream)
+						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 		
 	}
 };
@@ -126,72 +124,78 @@ struct SpoofCellwiseColAgg {
 
 template<typename T>
 struct SpoofCellwiseNoAgg {
-	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name,
-			  std::vector<Matrix<T>> &input, std::vector<Matrix<T>> &sides, uint32_t grix, DevMatPtrs<T>& dp) {
+	static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw) {
 		T value_type;
-		bool sparse_input = input.front().row_ptr != nullptr;
+		bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
 		
 		// num ctas
-		// ToDo: adaptive VT
+		// ToDo? adaptive VT
 		const uint32_t VT = 1;
 		uint32_t NB = std::ceil((N + NT * VT - 1) / (NT * VT));
 		if(sparse_input)
-			NB = input.front().rows;
+			NB = dbw->h_in<T>(0)->rows;
 		dim3 grid(NB, 1, 1);
 		dim3 block(NT, 1, 1);
 		uint32_t shared_mem_size = 0;
 
 #ifndef NDEBUG
+		std::cout << "output rows: " << dbw->h_out<T>()->rows << " cols: " << dbw->h_out<T>()->cols << " nnz: " <<
+			(dbw->h_out<T>()->row_ptr == nullptr ? dbw->h_out<T>()->rows * dbw->h_out<T>()->cols :
+				dbw->h_out<T>()->nnz) << " format: " << (dbw->h_out<T>()->row_ptr == nullptr
+					? "dense" : "sparse") << std::endl;
+
 		if(sparse_input) {
 				std::cout << "launching sparse spoof cellwise kernel " << op_name << " with " << NT * NB
-						  << " threads in " << NB << " blocks without aggregation for " << N << " elements"
-						  << std::endl;
+			  		<< " threads in " << NB << " blocks without aggregation for " << N << " elements" << std::endl;
 		}
 		else {
-			std::cout << "launching spoof cellwise kernel " << op_name << " with " << NT * NB
-					  << " threads in " << NB << " blocks without aggregation for " << N << " elements"
-					  << std::endl;
+			std::cout << "launching spoof cellwise kernel " << op_name << " with " << NT * NB << " threads in " << NB <<
+				" blocks without aggregation for " << N << " elements" << std::endl;
 		}
 #endif
-		
 		CHECK_CUDA(op->program->kernel(op_name)
-						   .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-						   .configure(grid, block, shared_mem_size)
-						   .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+						   .configure(grid, block, shared_mem_size, op->stream)
+						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
+
+		// copy over row indices from input to output if appropriate
+		if (op->isSparseSafe() && dbw->h_in<T>(0)->row_ptr != nullptr) {
+			// src/dst information (pointer address) is stored in *host* buffer!
+			CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->row_ptr, dbw->h_in<T>(0)->row_ptr,
+				(dbw->h_in<T>(0)->rows+1) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, op->stream));
+			CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->col_idx, dbw->h_in<T>(0)->col_idx,
+										 (dbw->h_in<T>(0)->nnz) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, op->stream));
+		}
 	}
 };
 
 template<typename T>
 struct SpoofCellwise {
-	static void exec(SpoofCUDAContext* ctx, SpoofOperator* _op, std::vector<Matrix<T>>& input,
-			std::vector<Matrix<T>>& sides, Matrix<T>& output, uint32_t grix,
-			DevMatPtrs<T>& dp)  {
-		
-		T value_type;
+	static void exec(SpoofCUDAContext* ctx, SpoofOperator* _op, DataBufferWrapper* dbw) {
 		auto* op = dynamic_cast<SpoofCellwiseOp*>(_op);
-		bool sparse_input = input.front().row_ptr != nullptr;
+		bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
 		uint32_t NT = 256; // ToDo: num threads
-		uint32_t N = input.front().rows * input.front().cols;
+		uint32_t N = dbw->h_in<T>(0)->rows * dbw->h_in<T>(0)->cols;
 		std::string op_name(op->name + "_DENSE");
 		if(sparse_input) {
 			op_name = std::string(op->name + "_SPARSE");
-			if(op->isSparseSafe() && input.front().nnz > 0)
-				N = input.front().nnz;
+			if(op->isSparseSafe() && dbw->h_in<T>(0)->nnz > 0)
+				N = dbw->h_in<T>(0)->nnz;
 		}
 		
 		switch(op->agg_type) {
 			case SpoofOperator::AggType::FULL_AGG:
 				op->agg_kernel = ctx->template getReductionKernel<T>(std::make_pair(op->agg_type, op->agg_op));
-				SpoofCellwiseFullAgg<T>::exec(op, NT, N, op_name, sides, grix, dp);
+				SpoofCellwiseFullAgg<T>::exec(op, NT, N, op_name, dbw);
 				break;
 			case SpoofOperator::AggType::ROW_AGG:
-				SpoofCellwiseRowAgg<T>::exec(op, NT, N, op_name, input, sides, grix, dp);
+				SpoofCellwiseRowAgg<T>::exec(op, NT, N, op_name, dbw);
 				break;
 			case SpoofOperator::AggType::COL_AGG:
-				SpoofCellwiseColAgg<T>::exec(op, NT, N, op_name, sides, grix, dp);
+				SpoofCellwiseColAgg<T>::exec(op, NT, N, op_name, dbw);
 				break;
 			case SpoofOperator::AggType::NO_AGG:
-				SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, input, sides, grix, dp);
+				SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, dbw);
 				break;
 			default:
 				throw std::runtime_error("unknown cellwise agg type" + std::to_string(static_cast<int>(op->agg_type)));
diff --git a/src/main/cuda/spoof-launcher/SpoofOperator.h b/src/main/cuda/spoof-launcher/SpoofOperator.h
index 0ccc633a47f..f256e817db2 100644
--- a/src/main/cuda/spoof-launcher/SpoofOperator.h
+++ b/src/main/cuda/spoof-launcher/SpoofOperator.h
@@ -28,18 +28,24 @@
 #include "host_utils.h"
 #include "Matrix.h"
 
+// these two constants have equivalents in Java code:
+const uint32_t JNI_MAT_ENTRY_SIZE = 40;
+const uint32_t TRANSFERRED_DATA_HEADER_SIZE = 32;
+
 struct SpoofOperator {
-//	enum class OpType : int { CW, RA, MA, OP, NONE };
 	enum class AggType : int { NO_AGG, FULL_AGG, ROW_AGG, COL_AGG };
 	enum class AggOp : int { SUM, SUM_SQ, MIN, MAX };
 	enum class RowType : int { FULL_AGG = 4 };
-	
-//	OpType op_type;
+
 	std::string name;
-//	jitify::Program program;
 	std::unique_ptr<jitify::Program> program;
 	
 	[[nodiscard]] virtual bool isSparseSafe() const = 0;
+
+	cudaStream_t stream{};
+	
+	SpoofOperator() { CHECK_CUDART(cudaStreamCreate(&stream));}
+	virtual ~SpoofOperator() {CHECK_CUDART(cudaStreamDestroy(stream));}
 };
 
 struct SpoofCellwiseOp : public SpoofOperator {
@@ -58,46 +64,90 @@ struct SpoofRowwiseOp : public SpoofOperator {
 	int32_t const_dim2;
 	RowType row_type;
 	
-	SpoofRowwiseOp(RowType rt, bool tb1, uint32_t ntv, int32_t cd2)  : row_type(rt), TB1(tb1), num_temp_vectors(ntv),
+	SpoofRowwiseOp(RowType rt, bool tb1, uint32_t ntv, int32_t cd2) : row_type(rt), TB1(tb1), num_temp_vectors(ntv),
 			const_dim2(cd2) {}
 			
 	[[nodiscard]] bool isSparseSafe() const override { return false; }
 };
 
-template<typename T>
-struct DevMatPtrs {
-	Matrix<T>* ptrs[3] = {0,0,0};
-	
-	Matrix<T>*& in = ptrs[0];
-	Matrix<T>*& sides = ptrs[1];
-	Matrix<T>*& out = ptrs[2];
-	T* scalars{};
-
-	~DevMatPtrs() {
-#ifndef NDEBUG
-		std::cout << "~DevMatPtrs() before cudaFree:\n";
-		int i = 0;
-		for (auto& p : ptrs) {
-			std::cout << " p[" << i << "]=" << p;
-			i++;
-		}
-		std::cout << std::endl;
-#endif
-		for (auto& p : ptrs) {
-			if (p) {
-				CHECK_CUDART(cudaFree(p));
-				p = nullptr;
-			}
-		}
-#ifndef NDEBUG
-		std::cout << "~DevMatPtrs() after cudaFree:\n";
-		i = 0;
-		for (auto& p : ptrs) {
-			std::cout << " p[" << i << "]=" << p;
-			i++;
-		}
-		std::cout << std::endl;
-#endif
+struct DataBufferWrapper {
+	std::byte* staging_buffer;
+	std::byte* device_buffer;
+
+
+	template<typename T>
+	Matrix<T>* in(std::byte* buffer, uint32_t idx) {
+		return reinterpret_cast<Matrix<T>*>(&buffer[TRANSFERRED_DATA_HEADER_SIZE + idx * JNI_MAT_ENTRY_SIZE]);
+	}
+
+	template<typename T>
+	Matrix<T>* sides(std::byte* buffer) {
+		return reinterpret_cast<Matrix<T>*>(&buffer[TRANSFERRED_DATA_HEADER_SIZE + num_inputs() * JNI_MAT_ENTRY_SIZE]);
+	}
+
+	template<typename T>
+	Matrix<T>* out(std::byte* buffer) {
+		return reinterpret_cast<Matrix<T>*>(&buffer[TRANSFERRED_DATA_HEADER_SIZE + (num_inputs() + num_sides())
+			* JNI_MAT_ENTRY_SIZE]);
+	}
+
+	template<typename T>
+	T* scalars(std::byte* buffer, uint32_t idx) {
+		return reinterpret_cast<T*>(&(buffer[TRANSFERRED_DATA_HEADER_SIZE + (num_inputs() + num_sides() + 1)
+			* JNI_MAT_ENTRY_SIZE + idx * sizeof(T)]));
+	}
+
+public:
+	explicit DataBufferWrapper(std::byte* staging, std::byte* dev_buf) : staging_buffer(staging),
+		device_buffer(dev_buf) { }
+
+	void toDevice(cudaStream_t &stream) const {
+		CHECK_CUDART(cudaMemcpyAsync(device_buffer, staging_buffer, *reinterpret_cast<uint32_t*>(&staging_buffer[0]),
+			cudaMemcpyHostToDevice, stream));
+	}
+
+	template<typename T>
+	Matrix<T>* d_in(uint32_t num) { return in<T>(device_buffer, num); }
+
+	template<typename T>
+	Matrix<T>* h_in(uint32_t num) { return in<T>(staging_buffer, num); }
+
+	template<typename T>
+	Matrix<T>* d_sides() { return sides<T>(device_buffer); }
+
+	template<typename T>
+	Matrix<T>* h_sides() { return sides<T>(staging_buffer); }
+
+	template<typename T>
+	Matrix<T>* d_out() { return out<T>(device_buffer); }
+
+	template<typename T>
+	Matrix<T>* h_out() { return out<T>(staging_buffer); }
+
+	template<typename T>
+	T* d_scalars(uint32_t idx = 0) { return scalars<T>(device_buffer, idx); }
+
+	template<typename T>
+	T* h_scalars(uint32_t idx = 0) { return scalars<T>(staging_buffer, idx); }
+
+	uint32_t op_id() const {
+		return *reinterpret_cast<uint32_t*>(&staging_buffer[sizeof(int)]);
+	}
+
+	uint64_t grix() const {
+		return *reinterpret_cast<uint64_t*>(&staging_buffer[2 * sizeof(int)]);
+	}
+
+	[[nodiscard]] uint32_t num_inputs() const {
+		return *reinterpret_cast<uint32_t*>(&staging_buffer[3 * sizeof(int)]);
+	}
+
+	uint32_t num_sides() const {
+		return *reinterpret_cast<uint32_t*>(&staging_buffer[4 * sizeof(int)]);
+	}
+
+	uint32_t num_scalars() const {
+		return *reinterpret_cast<uint32_t*>(&staging_buffer[6 * sizeof(int)]);
 	}
 };
 
diff --git a/src/main/cuda/spoof-launcher/SpoofRowwise.h b/src/main/cuda/spoof-launcher/SpoofRowwise.h
index 12953141098..4465ac99fa4 100644
--- a/src/main/cuda/spoof-launcher/SpoofRowwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofRowwise.h
@@ -27,27 +27,34 @@
 template <typename T>
 struct SpoofRowwise {
 	
-	static void exec([[maybe_unused]] SpoofCUDAContext* ctx, SpoofOperator* _op, std::vector<Matrix<T>>& input,
-			std::vector<Matrix<T>>& sides, Matrix<T>& output, uint32_t grix, DevMatPtrs<T>& dp)  {
+	static void exec([[maybe_unused]] SpoofCUDAContext* ctx, SpoofOperator* _op, DataBufferWrapper* dbw)  {
 		uint32_t NT=256;
 		T value_type;
-		bool sparse_input = input.front().row_ptr != nullptr;
+		bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
 		auto* op = dynamic_cast<SpoofRowwiseOp*>(_op);
-		dim3 grid(input.front().rows, 1, 1);
+		dim3 grid(dbw->h_in<T>(0)->rows, 1, 1);
 		dim3 block(NT, 1, 1);
 		unsigned int shared_mem_size = NT * sizeof(T);
-		
+
+		size_t out_num_elements = dbw->h_out<T>()->rows * dbw->h_out<T>()->cols;
+		if(dbw->h_out<T>()->row_ptr)
+			if(op->isSparseSafe() && dbw->h_out<T>()->nnz > 0)
+				out_num_elements = dbw->h_out<T>()->nnz;
+		//ToDo: only memset output when there is an output operation that *adds* to the buffer
+		CHECK_CUDART(cudaMemsetAsync(dbw->h_out<T>()->data, 0, out_num_elements * sizeof(T), op->stream));
+
+		//ToDo: handle this in JVM
 		uint32_t tmp_len = 0;
 		uint32_t temp_buf_size = 0;
 		T* d_temp = nullptr;
 		if(op->num_temp_vectors > 0) {
-			tmp_len = std::max(input.front().cols, op->const_dim2 < 0 ? 0 : static_cast<uint32_t>(op->const_dim2));
-			temp_buf_size = op->num_temp_vectors * tmp_len * input.front().rows * sizeof(T);
+			tmp_len = std::max(dbw->h_in<T>(0)->cols, op->const_dim2 < 0 ? 0u : static_cast<uint32_t>(op->const_dim2));
+			temp_buf_size = op->num_temp_vectors * tmp_len * dbw->h_in<T>(0)->rows * sizeof(T);
 #ifndef NDEBUG
 			std::cout << "num_temp_vect: " << op->num_temp_vectors << " temp_buf_size: " << temp_buf_size << " tmp_len: " << tmp_len << std::endl;
 #endif
 			CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&d_temp), temp_buf_size));
-			CHECK_CUDART(cudaMemset(d_temp, 0, temp_buf_size));
+			CHECK_CUDART(cudaMemsetAsync(d_temp, 0, temp_buf_size, op->stream));
 		}
 		
 		std::string op_name(op->name + "_DENSE");
@@ -56,21 +63,18 @@ struct SpoofRowwise {
 
 #ifndef NDEBUG
 		// ToDo: connect output to SystemDS logging facilities
-		std::cout << "launching spoof rowwise kernel " << op_name << " with " << NT * input.front().rows << " threads in "
-				<< input.front().rows << " blocks and " << shared_mem_size << " bytes of shared memory for "
-				<< input.front().rows << " cols processed by " << NT << " threads per row, adding "
+		std::cout << "launching spoof rowwise kernel " << op_name << " with " << NT * dbw->h_in<T>(0)->rows << " threads in "
+				<< dbw->h_in<T>(0)->rows << " blocks and " << shared_mem_size << " bytes of shared memory for "
+				<< dbw->h_in<T>(0)->rows << " cols processed by " << NT << " threads per row, adding "
 				<< temp_buf_size / 1024 << " kb of temp buffer in global memory." <<  std::endl;
 #endif
 		CHECK_CUDA(op->program->kernel(op_name)
-						   .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()), op->num_temp_vectors, tmp_len)
-						   .configure(grid, block, shared_mem_size)
-						   .launch(dp.in, dp.sides, dp.out, dp.scalars, d_temp, grix));
+						   .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1), dbw->num_sides()), op->num_temp_vectors, tmp_len)
+						   .configure(grid, block, shared_mem_size, op->stream)
+						   .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), d_temp, dbw->grix()));
 		
 		if(op->num_temp_vectors > 0)
 			CHECK_CUDART(cudaFree(d_temp));
-		
-//		if (op->TB1)
-//			CHECK_CUDART(cudaFree(b1_transposed));
 	}
 };
 
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.cpp b/src/main/cuda/spoof-launcher/jni_bridge.cpp
index 9a808a62731..5134d5e292c 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.cpp
+++ b/src/main/cuda/spoof-launcher/jni_bridge.cpp
@@ -23,69 +23,86 @@
 #include "SpoofRowwise.h"
 
 // JNI Methods to get/release arrays
-#define GET_ARRAY(env, input)((void *)env->GetPrimitiveArrayCritical(input, nullptr))
+//#define GET_ARRAY(env, input)((void *)env->GetPrimitiveArrayCritical(input, nullptr))
+//#define RELEASE_ARRAY(env, java, cpp)(env->ReleasePrimitiveArrayCritical(java, cpp, 0))
 
-#define RELEASE_ARRAY(env, java, cpp)(env->ReleasePrimitiveArrayCritical(java, cpp, 0))
+jclass jcuda_pointer_class;
+jclass jcuda_native_pointer_class;
+jfieldID pointer_buffer_field;
+jfieldID native_pointer_field;
 
 // error output helper
-void printException(const std::string& name, const std::exception& e, bool compile = false) {
+void printException(const std::string &name, const std::exception &e, bool compile = false) {
 	std::string type = compile ? "compiling" : "executing";
-	std::cout << "std::exception while " << type << "  SPOOF CUDA operator " << name << ":\n" << e.what() << std::endl;
+	std::cerr << "std::exception while " << type << "  SPOOF CUDA operator " << name << ":\n" << e.what() << std::endl;
 }
 
 
-// a pod struct to have names for the passed pointers
-template<typename T>
-struct LaunchMetadata {
-	const T& opID;
-	const T& grix;
-	const size_t& num_inputs;
-	const size_t& num_sides;
-	
-	// num entries describing one matrix (6 entries):
-	// {nnz,rows,cols,row_ptr,col_idxs,data}
-	const size_t& entry_size;
-	const T& num_scalars;
-	
-	explicit LaunchMetadata(const size_t* jvals) : opID(jvals[0]), grix(jvals[1]), num_inputs(jvals[2]),
-			num_sides(jvals[3]), entry_size(jvals[4]), num_scalars(jvals[5]) {}
-};
-
-
-[[maybe_unused]] JNIEXPORT jlong JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
-    	JNIEnv *jenv, [[maybe_unused]] jobject jobj, jint device_id, jstring resource_path) {
+[[maybe_unused]] JNIEXPORT jlong JNICALL Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context
+	(JNIEnv *jenv, [[maybe_unused]] jobject jobj, jint device_id, jstring resource_path)
+{
 	const char *cstr_rp = jenv->GetStringUTFChars(resource_path, nullptr);
 	size_t ctx = SpoofCUDAContext::initialize_cuda(device_id, cstr_rp);
 	jenv->ReleaseStringUTFChars(resource_path, cstr_rp);
+
+	// fetch some jcuda class handles
+	jcuda_pointer_class = jenv->FindClass("jcuda/Pointer");
+	jcuda_native_pointer_class = jenv->FindClass("jcuda/NativePointerObject");
+	pointer_buffer_field = jenv->GetFieldID(jcuda_pointer_class, "buffer", "Ljava/nio/Buffer;");
+	native_pointer_field = jenv->GetFieldID(jcuda_native_pointer_class, "nativePointer", "J");
+
+	// explicit cast to make compiler and linter happy
 	return static_cast<jlong>(ctx);
 }
 
 
-[[maybe_unused]] JNIEXPORT void JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
-		[[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong ctx, jint device_id) {
+[[maybe_unused]] JNIEXPORT void JNICALL Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context
+	([[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong ctx, jint device_id) {
 	SpoofCUDAContext::destroy_cuda(reinterpret_cast<SpoofCUDAContext *>(ctx), device_id);
 }
 
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofOperator_getNativeStagingBuffer
+	(JNIEnv *jenv, [[maybe_unused]] jclass jobj, jobject ptr, jlong _ctx, jint size) {
+	std::string operator_name("SpoofOperator_getNativeStagingBuffer");
+	try {
+		// retrieve data handles from JVM
+		auto *ctx = reinterpret_cast<SpoofCUDAContext *>(_ctx);
+		if (size > ctx->current_mem_size)
+			ctx->resize_staging_buffer(size);
+
+		jobject object = jenv->NewDirectByteBuffer(ctx->staging_buffer, size);
+		jenv->SetObjectField(ptr, pointer_buffer_field, object);
+		jenv->SetLongField(ptr,native_pointer_field, reinterpret_cast<jlong>(ctx->staging_buffer));
+
+		return 0;
+	}
+	catch (std::exception & e) {
+		printException(operator_name, e);
+	}
+	catch (...) {
+		printException(operator_name, std::runtime_error("unknown exception"), true);
+	}
+	return -1;
+}
 
 template<typename TEMPLATE>
-int compile_spoof_operator(JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong _ctx, jstring name, jstring src, TEMPLATE op) {
+int compile_spoof_operator
+	(JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong _ctx, jstring name, jstring src, TEMPLATE op) {
 	std::string operator_name;
 	try {
 		auto *ctx = reinterpret_cast<SpoofCUDAContext *>(_ctx);
 		const char *cstr_name = jenv->GetStringUTFChars(name, nullptr);
 		const char *cstr_src = jenv->GetStringUTFChars(src, nullptr);
 		operator_name = cstr_name;
-
 		op->name = operator_name;
+
 		int status = ctx->compile(std::move(op), cstr_src);
-		
+
 		jenv->ReleaseStringUTFChars(src, cstr_src);
 		jenv->ReleaseStringUTFChars(name, cstr_name);
 		return status;
 	}
-	catch (std::exception& e) {
+	catch (std::exception &e) {
 		printException(operator_name, e, true);
 	}
 	catch (...) {
@@ -96,73 +113,41 @@ int compile_spoof_operator(JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong _c
 
 
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_hops_codegen_cplan_CNodeCell_compile_1nvrtc
-		(JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring src, jint type, jint agg_op,
-				jboolean sparseSafe) {
-	
+	(JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring src, jint type, jint agg_op, jboolean sparseSafe) {
 	std::unique_ptr<SpoofCellwiseOp> op = std::make_unique<SpoofCellwiseOp>(SpoofOperator::AggType(type),
-			SpoofOperator::AggOp(agg_op), sparseSafe);
-	
+																			SpoofOperator::AggOp(agg_op), sparseSafe);
+
 	return compile_spoof_operator<std::unique_ptr<SpoofCellwiseOp>>(jenv, jobj, ctx, name, src, std::move(op));
 }
 
-
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_hops_codegen_cplan_CNodeRow_compile_1nvrtc
-		(JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring src, jint type, jint const_dim2,
-				jint num_vectors, jboolean TB1) {
-	
+	(JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring src, jint type, jint const_dim2,
+	 jint num_vectors, jboolean TB1) {
 	std::unique_ptr<SpoofRowwiseOp> op = std::make_unique<SpoofRowwiseOp>(SpoofOperator::RowType(type), TB1,
-			num_vectors, const_dim2);
+																		  num_vectors, const_dim2);
 	return compile_spoof_operator<std::unique_ptr<SpoofRowwiseOp>>(jenv, jobj, ctx, name, src, std::move(op));
 }
 
 
 template<typename T, typename TEMPLATE>
-int launch_spoof_operator(JNIEnv *jenv, [[maybe_unused]] jclass jobj, jlong _ctx, jlongArray _meta, jlongArray in,
-		jlongArray _sides, jlongArray out, jlong _scalars) {
-	std::string operator_name("unknown");
+int launch_spoof_operator([[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] jclass jobj, jlong _ctx) {
+	std::string operator_name("launch_spoof_operator jni-bridge");
 	try {
 		// retrieve data handles from JVM
-		auto *metacast = reinterpret_cast<size_t *>(GET_ARRAY(jenv, _meta));
 		auto *ctx = reinterpret_cast<SpoofCUDAContext *>(_ctx);
-		auto *inputs = reinterpret_cast<size_t *>(GET_ARRAY(jenv, in));
-		auto *sides = reinterpret_cast<size_t *>(GET_ARRAY(jenv, _sides));
-		auto *output = reinterpret_cast<size_t *>(GET_ARRAY(jenv, out));
-//		auto *scalars = reinterpret_cast<T *>(GET_ARRAY(jenv, _scalars));
-		auto *scalars = reinterpret_cast<T *>(_scalars);
-		LaunchMetadata<size_t> meta(metacast);
-		
+
+#ifndef NDEBUG
+		uint32_t opID = *reinterpret_cast<uint32_t*>(&ctx->staging_buffer[sizeof(uint32_t)]);
 		// this implicitly checks if op exists
-		operator_name = ctx->getOperatorName(meta.opID);
-		
-		// wrap/cast inputs
-		std::vector<Matrix<T>> mats_in;
-		for(auto i = 0ul; i < meta.num_inputs; i+=meta.entry_size)
-			mats_in.emplace_back(&inputs[i]);
-		
-		// wrap/cast sides
-		std::vector<Matrix<T>> mats_sides;
-		for(auto i = 0ul; i < meta.num_sides; i+=meta.entry_size)
-			mats_sides.emplace_back(&sides[i]);
-		
-		// wrap/cast output
-		Matrix<T> mat_out(output);
-		
-		// wrap/cast scalars
-//		std::unique_ptr<Matrix<T>> mat_scalars = scalars == nullptr ? 0 : std::make_unique<Matrix<T>>(scalars);
-		
+		operator_name = ctx->getOperatorName(opID);
+		std::cout << "executing op=" << operator_name << " id=" << opID << std::endl;
+#endif
 		// transfers resource pointers to GPU and calls op->exec()
-		ctx->launch<T, TEMPLATE>(meta.opID, mats_in, mats_sides, mat_out, scalars, meta.grix);
-		
-		// release data handles from JVM
-		RELEASE_ARRAY(jenv, _meta, metacast);
-		RELEASE_ARRAY(jenv, in, inputs);
-		RELEASE_ARRAY(jenv, _sides, sides);
-		RELEASE_ARRAY(jenv, out, output);
-//		RELEASE_ARRAY(jenv, _scalars, scalars);
-		
+		ctx->launch<T, TEMPLATE>();
+
 		return 0;
 	}
-	catch (std::exception& e) {
+	catch (std::exception &e) {
 		printException(operator_name, e);
 	}
 	catch (...) {
@@ -171,26 +156,26 @@ int launch_spoof_operator(JNIEnv *jenv, [[maybe_unused]] jclass jobj, jlong _ctx
 	return -1;
 }
 
-[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
-		(JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, jlongArray in, jlongArray sides, jlongArray out,
-		 jlong scalars) {
-	return launch_spoof_operator<float, SpoofCellwise<float>>(jenv, jobj, ctx, meta, in, sides, out, scalars);
-}
 
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1d
-		(JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, jlongArray in, jlongArray sides, jlongArray out,
-		 jlong scalars) {
-	return launch_spoof_operator<double, SpoofCellwise<double>>(jenv, jobj, ctx, meta, in, sides, out, scalars);
+	(JNIEnv *jenv, jclass jobj, jlong ctx) {
+	return launch_spoof_operator<double, SpoofCellwise<double>>(jenv, jobj, ctx);
 }
 
-[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
-		(JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, jlongArray in, jlongArray sides, jlongArray out,
-		 jlong scalars) {
-	return launch_spoof_operator<float, SpoofRowwise<float>>(jenv, jobj, ctx, meta, in, sides, out, scalars);
+
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
+	(JNIEnv *jenv, jclass jobj, jlong ctx) {
+	return launch_spoof_operator<double, SpoofCellwise<double>>(jenv, jobj, ctx);
 }
 
+
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1d
-		(JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, jlongArray in, jlongArray sides, jlongArray out,
-		 jlong scalars) {
-	return launch_spoof_operator<double, SpoofRowwise<double>>(jenv, jobj, ctx, meta, in, sides, out, scalars);
+	(JNIEnv *jenv, jclass jobj, jlong ctx) {
+	return launch_spoof_operator<double, SpoofRowwise<double>>(jenv, jobj, ctx);
 }
+
+
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
+	(JNIEnv *jenv, jclass jobj, jlong ctx) {
+	return launch_spoof_operator<double, SpoofRowwise<double>>(jenv, jobj, ctx);
+}
\ No newline at end of file
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.h b/src/main/cuda/spoof-launcher/jni_bridge.h
index 1e9ef20d1a2..0f25936c1a6 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.h
+++ b/src/main/cuda/spoof-launcher/jni_bridge.h
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-/* DO NOT EDIT THIS FILE - it is machine generated */
+/* DO EDIT THIS FILE - it is not machine generated */
 
 #pragma once
 #ifndef JNI_BRIDGE_H
@@ -32,20 +32,26 @@ extern "C" {
 /*
  * Class:     org_apache_sysds_hops_codegen_SpoofCompiler
  * Method:    initialize_cuda_context
- * Signature: (I)J
+ * Signature: (ILjava/lang/String;)J
  */
-[[maybe_unused]] JNIEXPORT jlong JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
-    JNIEnv *, [[maybe_unused]] jobject, jint, jstring);
+[[maybe_unused]] JNIEXPORT jlong JNICALL Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context
+	(JNIEnv *, [[maybe_unused]] jobject, jint, jstring);
 
 /*
  * Class:     org_apache_sysds_hops_codegen_SpoofCompiler
  * Method:    destroy_cuda_context
  * Signature: (JI)V
  */
-[[maybe_unused]] JNIEXPORT void JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
-		[[maybe_unused]] JNIEnv *, [[maybe_unused]] jobject, jlong, jint);
+[[maybe_unused]] JNIEXPORT void JNICALL Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context
+	([[maybe_unused]] JNIEnv *, [[maybe_unused]] jobject, jlong, jint);
+
+/*
+ * Class:     org_apache_sysds_runtime_codegen_SpoofOperator
+ * Method:    getNativeStagingBuffer
+ * Signature: (Ljcuda/Pointer;JI)I
+ */
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofOperator_getNativeStagingBuffer
+	(JNIEnv *, jclass, jobject, jlong, jint);
 
 /*
  * Class:     org_apache_sysds_hops_codegen_cplan_CNodeCell
@@ -64,37 +70,36 @@ Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
 		(JNIEnv *, [[maybe_unused]] jobject, jlong, jstring, jstring, jint, jint, jint, jboolean);
 
 /*
- * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwiseOperator
- * Method:    execute_f
- * Signature: (J[J[J[J[JJ)I
- */
-[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
-
-/*
- * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwiseOperator
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwise
  * Method:    execute_d
- * Signature: (J[J[J[J[JJ)I
+ * Signature: (J)I
  */
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1d
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
-
+	(JNIEnv *, jclass, jlong);
 
 /*
- * Class:     org_apache_sysds_runtime_codegen_SpoofCUDARowwise
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwise
  * Method:    execute_f
- * Signature: (J[J[J[J[JJ)I
+ * Signature: (J)I
  */
-[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
+	(JNIEnv *, jclass, jlong);
 
 /*
  * Class:     org_apache_sysds_runtime_codegen_SpoofCUDARowwise
  * Method:    execute_d
- * Signature: (J[J[J[J[JJ)I
+ * Signature: (J)I
  */
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1d
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
+	(JNIEnv *, jclass, jlong);
+
+/*
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDARowwise
+ * Method:    execute_f
+ * Signature: (J)I
+ */
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
+	(JNIEnv *, jclass, jlong);
 
 #ifdef __cplusplus
 }
diff --git a/src/main/cuda/spoof/cellwise.cu b/src/main/cuda/spoof/cellwise.cu
index 951f70901bd..6751b1db849 100644
--- a/src/main/cuda/spoof/cellwise.cu
+++ b/src/main/cuda/spoof/cellwise.cu
@@ -42,7 +42,8 @@ struct SpoofCellwiseOp {
 		uint32_t _grix;
 
 	SpoofCellwiseOp(Matrix<T>* _A, Matrix<T>* _B, Matrix<T>* _C, T* scalars, uint32_t grix) :
-			n(_A->cols), scalars(scalars), _grix(grix) {
+		n(_A->cols), scalars(scalars), _grix(grix)
+	{
 		A.init(_A);
 		c.init(_C);
 		alen = A.row_len(grix);
diff --git a/src/main/cuda/spoof/rowwise.cu b/src/main/cuda/spoof/rowwise.cu
index 7d80d54e82a..b31ce0c2ce5 100644
--- a/src/main/cuda/spoof/rowwise.cu
+++ b/src/main/cuda/spoof/rowwise.cu
@@ -55,6 +55,8 @@ struct SpoofRowwiseOp //%HAS_TEMP_VECT%
 
 	__device__  __forceinline__ void exec_dense(uint32_t ai, uint32_t ci, uint32_t rix) {
 //%BODY_dense%
+		if (debug_row() && debug_thread())
+			printf("c[0]=%4.3f\n", c.vals(0)[0]);
 	}
 
 	__device__  __forceinline__ void exec_sparse(uint32_t ai, uint32_t ci, uint32_t rix) {
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
index 47a7178f376..f8030007288 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
@@ -156,10 +156,12 @@ public String codegen(boolean sparse, GeneratorAPI _api) {
 		if(tmpDense.contains("grix"))
 			tmp = tmp.replace("//%NEED_GRIX%", "\t\tuint32_t grix=_grix + rix;");
 		else
-			tmp = tmp.replace("//%NEED_GRIX%", "");
-		tmp = tmp.replace("//%NEED_RIX%", "");
-		tmp = tmp.replace("//%NEED_CIX%", "");
-		
+			tmp = tmp.replace("//%NEED_GRIX%\n", "");
+
+		// remove empty lines
+//		if(!api.isJava())
+//			tmpDense = tmpDense.replaceAll("(?m)^[ \t]*\r?\n", "");
+
 		tmp = tmp.replace("%BODY_dense%", tmpDense);
 		
 		//Return last TMP. Square it for CUDA+SUM_SQ
@@ -171,10 +173,7 @@ public String codegen(boolean sparse, GeneratorAPI _api) {
 		tmp = tmp.replace("%AGG_OP_NAME%", (_aggOp != null) ? "AggOp." + _aggOp.name() : "null");
 		tmp = tmp.replace("%SPARSE_SAFE%", String.valueOf(isSparseSafe()));
 		tmp = tmp.replace("%SEQ%", String.valueOf(containsSeq()));
-		
-		// maybe empty lines
-		//tmp = tmp.replaceAll("(?m)^[ \t]*\r?\n", "");
-		
+
 		if(api == GeneratorAPI.CUDA) {
 			// ToDo: initial_value is misused to pass VT (values per thread) to no_agg operator
 			String agg_op = "IdentityOp";
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
index 0e21017e2b5..f2405d5b5c5 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
@@ -191,7 +191,7 @@ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
 				case MULT2:
 					return "	T %TMP% = %IN1% + %IN1%;\n";
 				case ABS:
-					return "	T %TMP% = fabs(%IN1%);\n";
+					return "\t\tT %TMP% = fabs(%IN1%);\n";
 				case SIN:
 					return "	T %TMP% = sin(%IN1%);\n";
 				case COS:
@@ -217,7 +217,7 @@ public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
 				case LOG:
 					return "		T %TMP% = log(%IN1%);\n";
 				case ROUND:
-					return "	T %TMP% = round(%IN1%);\n";
+					return "\t\tT %TMP% = round(%IN1%);\n";
 				case CEIL:
 					return "	T %TMP% = ceil(%IN1%);\n";
 				case FLOOR:
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
index 05109413ce0..5a3211435a7 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
@@ -23,11 +23,11 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.cp.DoubleObject;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 
@@ -38,101 +38,75 @@ public class SpoofCUDACellwise extends SpoofCellwise implements SpoofCUDAOperato
 	private static final Log LOG = LogFactory.getLog(SpoofCUDACellwise.class.getName());
 	private final int ID;
 	private final PrecisionProxy call;
-	private Pointer ptr;
 	private final SpoofCellwise fallback_java_op;
+	private final long ctx;
 	
 	public SpoofCUDACellwise(CellType type, boolean sparseSafe, boolean containsSeq, AggOp aggOp, int id,
-			PrecisionProxy ep, SpoofCellwise fallback) {
+		PrecisionProxy ep, SpoofCellwise fallback)
+	{
 		super(type, sparseSafe, containsSeq, aggOp);
 		ID = id;
 		call = ep;
-		ptr = null;
 		fallback_java_op = fallback;
+		ctx = SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA);
 	}
 	
 	@Override
-	public ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs, ArrayList<ScalarObject> scalarObjects) {
+	public ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
+		ArrayList<ScalarObject> scalarObjects)
+	{
 		double[] result = new double[1];
-		// ToDo: this is a temporary "solution" before perf opt
-		int NT=256;
-		long N = inputs.get(0).getNumRows() * inputs.get(0).getNumColumns();
-		long num_blocks = ((N + NT * 2 - 1) / (NT * 2));
-		Pointer ptr = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks);
-		long[] out = {1,1,1, 0, 0, GPUObject.getPointerAddress(ptr)};
-		int offset = 1;
-		if(call.exec(ec, this, ID, prepareInputPointers(ec, inputs, offset), 
-			prepareSideInputPointers(ec, inputs, offset, false), out, scalarObjects, 0 ) != 0) {
-			LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.\n");
-			// ToDo: java fallback
+		Pointer[] ptr = new Pointer[1];
+		packDataForTransfer(ec, inputs, scalarObjects, null, 1, ID, 0,false, ptr);
+		if(NotEmpty(inputs.get(0).getGPUObject(ec.getGPUContext(0)))) {
+			if(call.exec(this) != 0)
+				LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
 		}
-		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr, result, getName(), false);
-		
+		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr[0], result, getName(), false);
+		ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr[0], DMLScript.EAGER_CUDA_FREE);
 		return new DoubleObject(result[0]);
 	}
 	
 	@Override public String getName() {
 		return getSpoofType();
 	}
-	
-	@Override public void setScalarPtr(Pointer _ptr) {
-		ptr = _ptr;
-	}
-	
-	@Override public Pointer getScalarPtr() {
-		return ptr;
-	}
-	
-	@Override public void releaseScalarGPUMemory(ExecutionContext ec) {
-		if(ptr != null) {
-			ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr, DMLScript.EAGER_CUDA_FREE);
-			ptr = null;
-		}
-	}
-	
+
 	@Override
-	public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs, ArrayList<ScalarObject> scalarObjects,
-			String outputName) {
-		
+	public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
+		ArrayList<ScalarObject> scalarObjects, String outputName)
+	{
 		long out_rows = ec.getMatrixObject(outputName).getNumRows();
 		long out_cols = ec.getMatrixObject(outputName).getNumColumns();
-		MatrixObject a = inputs.get(0);
-		GPUContext gctx = ec.getGPUContext(0);
-		int m = (int) a.getNumRows();
-		int n = (int) a.getNumColumns();
-		double[] scalars = prepInputScalars(scalarObjects);
+
 		if(_type == CellType.COL_AGG)
 			out_rows = 1;
 		else if(_type == SpoofCellwise.CellType.ROW_AGG)
 			out_cols = 1;
-		
+
+		double[] scalars = prepInputScalars(scalarObjects);
 		boolean sparseSafe = isSparseSafe() || ((inputs.size() < 2) && 
-				genexec( 0, new SideInput[0], scalars, m, n, 0, 0 ) == 0);
-		
-//		ec.setMetaData(outputName, out_rows, out_cols);
-		GPUObject g = a.getGPUObject(gctx);
-		boolean sparseOut = _type == CellType.NO_AGG && sparseSafe && g.isSparse();
-		
-		long nnz = g.getNnz("spoofCUDA" + getSpoofType(), false);
-		if(sparseOut)
-			LOG.warn("sparse out");
+				genexec( 0, new SideInput[0], scalars, (int) inputs.get(0).getNumRows(),
+					(int) inputs.get(0).getNumColumns(), 0, 0 ) == 0);
+
+		GPUObject in_obj = inputs.get(0).getGPUObject(ec.getGPUContext(0));
+		boolean sparseOut = _type == CellType.NO_AGG && sparseSafe && in_obj.isSparse();
+		long nnz = in_obj.getNnz("spoofCUDA" + getSpoofType(), false);
 		MatrixObject out_obj = sparseOut ?
 				(ec.getSparseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, (isSparseSafe() && nnz > 0) ?
 						nnz : out_rows * out_cols).getKey()) :
 				(ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols).getKey());
-		
-		int offset = 1;
-		if(!inputIsEmpty(a.getGPUObject(gctx)) || !sparseSafe) {
-			if(call.exec(ec, this, ID, prepareInputPointers(ec, inputs, offset), prepareSideInputPointers(ec, inputs, offset, false),
-				prepareOutputPointers(ec, out_obj, sparseOut), scalarObjects, 0) != 0) {
-				LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.(ToDo)\n");
-				// ToDo: java fallback
-			}
+
+		packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,false, null);
+		if(NotEmpty(in_obj) || !sparseSafe) {
+			if(call.exec(this) != 0)
+				LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
 		}
 		return out_obj;
 	}
 	
-	private static boolean inputIsEmpty(GPUObject g) {
-		return g.getDensePointer() == null && g.getSparseMatrixCudaPointer() == null;
+	private static boolean NotEmpty(GPUObject g) {
+		// ToDo: check if that check is sufficient
+		return g.getDensePointer() != null || g.getSparseMatrixCudaPointer() != null;
 	}
 	
 	// used to determine sparse safety
@@ -140,15 +114,11 @@ private static boolean inputIsEmpty(GPUObject g) {
 	protected double genexec(double a, SideInput[] b, double[] scalars, int m, int n, long gix, int rix, int cix) {
 		return fallback_java_op.genexec(a, b, scalars, m, n, 0, 0, 0);
 	}
-	
-	public int execute_sp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_f(ctx, meta, in, sides, out, scalars);	
-	}
-	
-	public int execute_dp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_d(ctx, meta, in, sides, out, scalars);
-	}
-	
-	public static native int execute_f(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
-	public static native int execute_d(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
+
+	public int execute_dp(long ctx) { return execute_d(ctx); }
+	public int execute_sp(long ctx) { return execute_d(ctx); }
+	public long getContext() { return ctx; }
+
+	public static native int execute_d(long ctx);
+	public static native int execute_s(long ctx);
 }
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
index 118f97a0e13..02fbf961aad 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
@@ -19,7 +19,10 @@
 
 package org.apache.sysds.runtime.codegen;
 
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import jcuda.Pointer;
+
 import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
@@ -27,146 +30,112 @@
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 
-import java.util.ArrayList;
-
+import static jcuda.runtime.cudaError.cudaSuccess;
 import static org.apache.sysds.runtime.matrix.data.LibMatrixCUDA.sizeOfDataType;
 
 public interface SpoofCUDAOperator  {
-	int JNI_MAT_ENTRY_SIZE = 6;
+	// these two constants have equivalences in native code:
+	int JNI_MAT_ENTRY_SIZE = 40;
+	int TRANSFERRED_DATA_HEADER_SIZE = 32;
+
 	abstract class PrecisionProxy {
 		protected final long ctx;
 		
 		public PrecisionProxy() { ctx = SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA);	}
 		
-		public abstract int exec(ExecutionContext ec, SpoofCUDAOperator op, int opID, long[] in, long[] sides, long[] out,
-				ArrayList<ScalarObject> scalarObjects, long grix);
-		
-		protected Pointer transferScalars(ExecutionContext ec, SpoofCUDAOperator op, int sizeOfDataType,
-				ArrayList<ScalarObject> scalarObjects) {
-			double[] s = SpoofOperator.prepInputScalars(scalarObjects);
-			Pointer ptr = ec.getGPUContext(0).allocate(op.getName(), (long) scalarObjects.size() * sizeOfDataType);
-			LibMatrixCUDA.cudaSupportFunctions.hostToDevice(ec.getGPUContext(0), s, ptr, op.getName());
-			return ptr;
-		}
+		public abstract int exec(SpoofCUDAOperator op);
 	}
 	
 	String getName();
-	
-	void setScalarPtr(Pointer ptr);
-	
-	Pointer getScalarPtr();
-	
-	void releaseScalarGPUMemory(ExecutionContext ec);
-	
-	default long [] prepareInputPointers(ExecutionContext ec, ArrayList<MatrixObject> inputs, int offset) {
-		long [] in = new long[offset * JNI_MAT_ENTRY_SIZE];
-		for(int i = 0; i < offset; i++) {
-			int j = i  * JNI_MAT_ENTRY_SIZE;
-			
-			if(inputs.get(i).getGPUObject(ec.getGPUContext(0)).isSparse()) {
-				in[j] = ec.getGPUSparsePointerAddress(inputs.get(i)).nnz;
-				in[j + 1] = inputs.get(i).getNumRows();
-				in[j + 2] = inputs.get(i).getNumColumns();
-				in[j + 3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).rowPtr);
-				in[j + 4] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).colInd);
-				in[j + 5] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).val);
-			}
-			else {
-				in[j] = inputs.get(i).getNnz();
-				in[j + 1] = inputs.get(i).getNumRows();
-				in[j + 2] = inputs.get(i).getNumColumns();
-				in[j + 5] = ec.getGPUDensePointerAddress(inputs.get(i));
-			}
-		}
-		return in;
+
+	default void writeMatrixDescriptorToBuffer(ByteBuffer dst, int rows, int cols, long row_ptr,
+		long col_idx_ptr, long data_ptr, long nnz)
+	{
+		dst.putLong(nnz);
+		dst.putInt(rows);
+		dst.putInt(cols);
+		dst.putLong(row_ptr);
+		dst.putLong(col_idx_ptr);
+		dst.putLong(data_ptr);
 	}
-	
-	default long [] prepareSideInputPointers(ExecutionContext ec, ArrayList<MatrixObject> inputs, int offset, boolean tB1) {
-		long[] sides = new long[(inputs.size() - offset) * JNI_MAT_ENTRY_SIZE];
-		for(int i = offset; i < inputs.size(); i++) {
-			int j = (i - offset)  * JNI_MAT_ENTRY_SIZE;
-			if(inputs.get(i).getGPUObject(ec.getGPUContext(0)).isSparse()) {
-				sides[j] = ec.getGPUSparsePointerAddress(inputs.get(i)).nnz;
-				sides[j + 1] = inputs.get(i).getNumRows();
-				sides[j + 2] = inputs.get(i).getNumColumns();
-				sides[j + 3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).rowPtr);
-				sides[j + 4] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).colInd);
-				sides[j + 5] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).val);
-			}
-			else {
-				if(tB1 && j == 0) {
-					long rows = inputs.get(i).getNumRows();
-					long cols = inputs.get(i).getNumColumns();
-					Pointer b1 = inputs.get(i).getGPUObject(ec.getGPUContext(0)).getDensePointer();
-					Pointer ptr = ec.getGPUContext(0).allocate(getName(), rows * cols * sizeOfDataType);
-					
-//					double[] tmp1 = new double[(int) (rows * cols)];
-//					LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), b1, tmp1, getName(), false);
-//
-//					System.out.println("Mat before transpose: rows=" + rows + " cols=" + cols + "\n");
-//					for(int m = 0; m < rows; m++) {
-//						StringBuilder sb = new StringBuilder();
-//						for(int n = 0; n < cols; n++)
-//							sb.append(" " + tmp1[(int) (cols * m + n)]);
-//						System.out.println(sb.toString());
-//					}
-					
-					LibMatrixCUDA.denseTranspose(ec, ec.getGPUContext(0), getName(),
-						b1, ptr, rows, cols);
-					
-//					double[] tmp2 = new double[(int) (rows * cols)];
-//					LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr, tmp2, getName(), false);
-//
-//					System.out.println("Mat after transpose: rows=" + cols + " cols=" + rows + "\n");
-//					for(int m = 0; m < cols; m++) {
-//						StringBuilder sb = new StringBuilder();
-//						for(int n = 0; n < rows; n++)
-//							sb.append(" " + tmp2[(int) (rows * m + n)]);
-//						System.out.println(sb.toString());
-//					}
 
-					sides[j] = inputs.get(i).getNnz();
-					sides[j + 1] = cols;
-					sides[j + 2] = rows;
-					sides[j + 5] = GPUObject.getPointerAddress(ptr);
-					
-				} else {
-					sides[j] = inputs.get(i).getNnz();
-					sides[j + 1] = inputs.get(i).getNumRows();
-					sides[j + 2] = inputs.get(i).getNumColumns();
-					sides[j + 5] = ec.getGPUDensePointerAddress(inputs.get(i));
-				}
+	default void prepareMatrixPointers(ByteBuffer buf, ExecutionContext ec, MatrixObject mo, boolean tB1) {
+		if(mo.getGPUObject(ec.getGPUContext(0)).isSparse()) {
+			writeMatrixDescriptorToBuffer(buf, (int)mo.getNumRows(), (int)mo.getNumColumns(),
+				GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(mo).rowPtr),
+					GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(mo).colInd),
+						GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(mo).val),
+							ec.getGPUSparsePointerAddress(mo).nnz);
+		}
+		else {
+			if(tB1) {
+				int rows = (int)mo.getNumRows();
+				int cols = (int)mo.getNumColumns();
+				Pointer b1 = mo.getGPUObject(ec.getGPUContext(0)).getDensePointer();
+				Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType);
+				LibMatrixCUDA.denseTranspose(ec, ec.getGPUContext(0), getName(), b1, ptr, rows, cols);
+				writeMatrixDescriptorToBuffer(buf, rows, cols, 0, 0, GPUObject.getPointerAddress(ptr), mo.getNnz());
+			} else {
+				writeMatrixDescriptorToBuffer(buf, (int)mo.getNumRows(), (int)mo.getNumColumns(), 0, 0,
+					ec.getGPUDensePointerAddress(mo), mo.getNnz());
 			}
 		}
-		return sides;
 	}
-	
-	default long[] prepareOutputPointers(ExecutionContext ec, MatrixObject output, boolean sparseOut) {
-		long[] out = {0,0,0,0,0,0};
 
-		if(sparseOut) {
-			out[0] = ec.getGPUSparsePointerAddress(output).nnz;
-			out[1] = output.getNumRows();
-			out[2] = output.getNumColumns();
-			out[3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(output).rowPtr);
-			out[4] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(output).colInd);
-			out[5] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(output).val);
+	default void packDataForTransfer(ExecutionContext ec, ArrayList<MatrixObject> inputs,
+		ArrayList<ScalarObject> scalarObjects, MatrixObject out_obj, int num_inputs, int ID, long grix, boolean tB1,
+			Pointer[] ptr)
+	{
+		int op_data_size = (inputs.size() + 1) * JNI_MAT_ENTRY_SIZE + scalarObjects.size() * Double.BYTES + TRANSFERRED_DATA_HEADER_SIZE;
+		Pointer staging = new Pointer();
+		if(SpoofOperator.getNativeStagingBuffer(staging, this.getContext(), op_data_size) != cudaSuccess)
+			throw new RuntimeException("Failed to get native staging buffer from spoof operator");
+		ByteBuffer buf = staging.getByteBuffer();
+		buf.putInt(op_data_size);
+		buf.putInt(ID);
+		buf.putInt((int)grix);
+		buf.putInt(num_inputs);
+		buf.putInt(inputs.size() - num_inputs);
+		buf.putInt(out_obj == null ? 0 : 1);
+		buf.putInt(scalarObjects.size());
+		buf.putInt(-1); // padding
+
+		// copy input & side input pointers
+		for(int i=0; i < inputs.size(); i++) {
+			if(i == num_inputs)
+				prepareMatrixPointers(buf, ec, inputs.get(i), tB1);
+			else
+				prepareMatrixPointers(buf, ec, inputs.get(i), false);
 		}
-		else {
-			out[0] = output.getNnz();
-			out[1] = output.getNumRows();
-			out[2] = output.getNumColumns();
-			out[5] = ec.getGPUDensePointerAddress(output);
+
+		// copy output pointers or allocate buffer for reduction
+ 		if(out_obj == null) {
+			long num_blocks = 1;
+			if(this instanceof SpoofCUDACellwise) {
+				int NT = 256;
+				long N = inputs.get(0).getNumRows() * inputs.get(0).getNumColumns();
+				num_blocks = ((N + NT * 2 - 1) / (NT * 2));
+			}
+			ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks);
+			writeMatrixDescriptorToBuffer(buf, 1, 1, 0, 0, GPUObject.getPointerAddress(ptr[0]), 1);
+		}
+ 		else {
+			prepareMatrixPointers(buf, ec, out_obj, false);
+		}
+
+ 		// copy scalar values (no pointers)
+		for(ScalarObject scalarObject : scalarObjects) {
+			buf.putDouble(scalarObject.getDoubleValue());
 		}
-		return out;
 	}
-	
-	MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs, 
+
+	MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 			ArrayList<ScalarObject> scalarObjects, String outputName);
 	
 	ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 		ArrayList<ScalarObject> scalarObjects);
-	
-	int execute_sp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
-	int execute_dp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
+
+	int execute_dp(long ctx);
+	int execute_sp(long ctx);
+	long getContext();
 }
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
index b5686c7dc85..2632faeb84f 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
@@ -23,11 +23,11 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.cp.DoubleObject;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 
 import java.util.ArrayList;
@@ -37,55 +37,40 @@ public class SpoofCUDARowwise extends SpoofRowwise implements SpoofCUDAOperator
 	private static final Log LOG = LogFactory.getLog(SpoofCUDARowwise.class.getName());
 	private final int ID;
 	private final PrecisionProxy call;
-	private Pointer ptr;
+	private final long ctx;
 	
 	public SpoofCUDARowwise(RowType type,  long constDim2, boolean tB1, int reqVectMem, int id,
-		PrecisionProxy ep) {
+		PrecisionProxy ep)
+	{
 		super(type, constDim2, tB1, reqVectMem);
 		ID = id;
 		call = ep;
-		ptr = null;
+		ctx = SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA);
 	}
 	
 	@Override public String getName() {
 		return getSpoofType();
 	}
-	
-	@Override public void setScalarPtr(Pointer _ptr) {
-		ptr = _ptr;
-	}
-	
-	@Override public Pointer getScalarPtr() {
-		return ptr;
-	}
-	
-	@Override public void releaseScalarGPUMemory(ExecutionContext ec) {
-		if(ptr != null) {
-			ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr, DMLScript.EAGER_CUDA_FREE);
-			ptr = null;
-		}
-	}
-	
+
 	@Override
 	public ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
-		ArrayList<ScalarObject> scalarObjects) {
+		ArrayList<ScalarObject> scalarObjects)
+	{
 		double[] result = new double[1];
-		Pointer ptr = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType);
-		long[] out = {1,1,1, 0, 0, GPUObject.getPointerAddress(ptr)};
-		int offset = 1;
-		if(call.exec(ec, this, ID, prepareInputPointers(ec, inputs, offset), prepareSideInputPointers(ec, inputs, offset, _tB1),
-				out, scalarObjects, 0) != 0) {
-			LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.\n");
-			// ToDo: java fallback
-		}
-		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr, result, getName(), false);
+		Pointer[] ptr = new Pointer[1];
+		packDataForTransfer(ec, inputs, scalarObjects, null, 1, ID, 0,_tB1, ptr);
+		if(call.exec(this) != 0)
+			LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
+
+		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr[0], result, getName(), false);
+		ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr[0], DMLScript.EAGER_CUDA_FREE);
 		return new DoubleObject(result[0]);
 	}
 	
 	@Override
 	public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
-		ArrayList<ScalarObject> scalarObjects, String outputName) {
-		
+		ArrayList<ScalarObject> scalarObjects, String outputName)
+	{
 		int m = (int) inputs.get(0).getNumRows();
 		int n = (int) inputs.get(0).getNumColumns();
 		final int n2 = _type.isConstDim2(_constDim2) ? (int)_constDim2 : _type.isRowTypeB1() ||
@@ -93,13 +78,12 @@ public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 		OutputDimensions out_dims = new OutputDimensions(m, n, n2);
 		ec.setMetaData(outputName, out_dims.rows, out_dims.cols);
 		MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols).getKey();
-		
-		int offset = 1;
-		if(call.exec(ec,this, ID, prepareInputPointers(ec, inputs, offset), prepareSideInputPointers(ec, inputs, 
-				offset, _tB1), prepareOutputPointers(ec, out_obj, false), scalarObjects, 0) != 0) {
-			LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.\n");
-			// ToDo: java fallback
-		}
+
+		packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0,_tB1, null);
+
+		if(call.exec(this) != 0)
+			LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
+
 		return out_obj;
 	}
 	
@@ -110,15 +94,11 @@ public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 	// unused
 	@Override protected void genexec(double[] avals, int[] aix, int ai, SideInput[] b, double[] scalars, double[] c,
 		int ci, int alen, int n, long grix, int rix) { }
-	
-	public int execute_sp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_f(ctx, meta, in, sides, out, scalars);
-	}
-	
-	public int execute_dp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_d(ctx, meta, in, sides, out, scalars);
-	}
-	
-	public static native int execute_f(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
-	public static native int execute_d(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
+
+	public int execute_dp(long ctx) { return execute_d(ctx); }
+	public int execute_sp(long ctx) { return execute_d(ctx); }
+	public long getContext() { return ctx; }
+
+	public static native int execute_d(long ctx);
+	public static native int execute_s(long ctx);
 }
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
index fe8d932c448..f2700911fab 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
@@ -23,6 +23,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import jcuda.Pointer;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLRuntimeException;
@@ -321,4 +322,6 @@ public void reset() {
 			currColPos = 0;
 		}
 	}
+
+	public static native int getNativeStagingBuffer(Pointer ptr, long context, int size);
 }
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
index 9fd1bcfa481..b54d6fe4f94 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
@@ -35,7 +35,6 @@
 import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.instructions.cp.CPOperand;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.LineageItem;
 import org.apache.sysds.runtime.lineage.LineageItemUtils;
 import org.apache.sysds.utils.GPUStatistics;
@@ -52,21 +51,14 @@ public class SpoofCUDAInstruction extends GPUInstruction {
 	public final CPOperand _out;
 	
 	public static class SinglePrecision extends SpoofCUDAOperator.PrecisionProxy {
-		public int exec(ExecutionContext ec, SpoofCUDAOperator op, int opID, long[] in, long[] sides, long[] out,
-				ArrayList<ScalarObject> scalarObjects, long grix) {
-			op.setScalarPtr(transferScalars(ec, op, Sizeof.FLOAT, scalarObjects));
-			long[] _metadata = { opID, grix, in.length, sides.length, out.length, scalarObjects.size() };
-			return op.execute_sp(ctx, _metadata, in, sides, out, GPUObject.getPointerAddress(op.getScalarPtr()));
+		public int exec(SpoofCUDAOperator op) {
+			return op.execute_sp(ctx);
 		}
 	}
 	
 	public static class DoublePrecision extends SpoofCUDAOperator.PrecisionProxy {
-		public int exec(ExecutionContext ec, SpoofCUDAOperator op, int opID, long[] in, long[] sides, long[] out,
-				ArrayList<ScalarObject> scalarObjects, long grix) {
-			if(!scalarObjects.isEmpty())
-				op.setScalarPtr(transferScalars(ec, op, Sizeof.DOUBLE, scalarObjects));
-			long[] _metadata = { opID, grix, in.length, sides.length, out.length, scalarObjects.size() };
-			return op.execute_dp(ctx, _metadata, in, sides, out, GPUObject.getPointerAddress(op.getScalarPtr()));
+		public int exec(SpoofCUDAOperator op) {
+			return op.execute_dp(ctx);
 		}
 	}
 	
@@ -101,7 +93,6 @@ public static SpoofCUDAInstruction parseInstruction(String str) {
 		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
 
 		ArrayList<CPOperand> inlist = new ArrayList<>();
-//		Integer op_id =  CodegenUtils.getCUDAopID(parts[2].split("\\.")[1]);
 		Integer op_id = CodegenUtils.getCUDAopID(parts[2]);
 		Class<?> cla = CodegenUtils.getClass(parts[2]);
 		SpoofOperator fallback_java_op = CodegenUtils.createInstance(cla);
@@ -141,11 +132,9 @@ else if(_out.getDataType() == Types.DataType.SCALAR) {
 				ScalarObject out = _op.execute(ec, inputs, scalars);
 				ec.setScalarOutput(_out.getName(), out);
 			}
-			
-			_op.releaseScalarGPUMemory(ec);
 		}
 		catch(Exception ex) {
-			LOG.error("SpoofCUDAInstruction: " + _op.getName() + " operator failed to execute. Trying Java fallback.(ToDo)\n");
+			LOG.error("SpoofCUDAInstruction: " + _op.getName() + " operator failed to execute :(\n");
 			
 			throw new DMLRuntimeException(ex);
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index efbf2011bd2..633e3fc7d26 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -1168,4 +1168,5 @@ public long getDensePointerAddress() {
 	
 	public static long getPointerAddress(Pointer p) {
 		return (p == null) ?  0 : getPointerAddressInternal(p);
-	}}
+	}
+}