Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 7 additions & 35 deletions src/main/cuda/headers/Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,19 @@ using int32_t = int;

template <typename T>
struct Matrix {
int32_t nnz;
uint64_t nnz;
uint32_t rows;
uint32_t cols;

uint32_t* row_ptr;
uint32_t* col_idx;
T* data;

typedef T value_type;

explicit Matrix(size_t* jvals) : nnz(jvals[0]), rows(jvals[1]), cols(jvals[2]),
row_ptr(reinterpret_cast<uint32_t*>(jvals[3])),
col_idx(reinterpret_cast<uint32_t*>((jvals[4]))), data(reinterpret_cast<T*>(jvals[5])) {}
explicit Matrix(uint8_t* jvals) : nnz(*reinterpret_cast<uint32_t*>(&jvals[0])),
rows(*reinterpret_cast<uint32_t*>(&jvals[8])), cols(*reinterpret_cast<uint32_t*>(&jvals[12])),
row_ptr(reinterpret_cast<uint32_t*>(jvals[16])), col_idx(reinterpret_cast<uint32_t*>((jvals[24]))),
data(static_cast<T*>(jvals[32])) {}
};

#ifdef __CUDACC__
Expand Down Expand Up @@ -72,7 +72,7 @@ class MatrixAccessor {

__device__ void init(Matrix<T>* mat) { _mat = mat; }

__device__ uint32_t& nnz() { return _mat->nnz; }
__device__ uint32_t& nnz() { return return _mat->row_ptr == nullptr ? _mat->rows * _mat->cols : _mat->nnz; }
__device__ uint32_t cols() { return _mat->cols; }
__device__ uint32_t rows() { return _mat->rows; }

Expand Down Expand Up @@ -133,7 +133,7 @@ class MatrixAccessor {

//ToDo sparse accessors
__device__ uint32_t len_sparse() {
return _mat->nnz;
return _mat->row_ptr[_mat->rows];
}

__device__ uint32_t pos_sparse(uint32_t rix) {
Expand Down Expand Up @@ -227,34 +227,6 @@ class RingBuffer {
}
};

template <typename T, int NUM_B>
struct SpoofOp {
MatrixAccessor<T> a;
MatrixAccessor<T> b[NUM_B];
MatrixAccessor<T> c;
T* scalars;
uint32_t grix;
T* avals;
uint32_t* aix;
uint32_t alen;

SpoofOp(Matrix<T>* A, Matrix<T>* B, Matrix<T>* C, T* scalars, T* tmp_stor, uint32_t grix) :
scalars(scalars), grix(grix), avals(A->data), aix(A->col_idx) {
a.init(A);
c.init(C);
alen = a.row_len(grix);

if(B)
for(auto i = 0; i < NUM_B; ++i)
b[i].init(&(B[i]));
}

// __device__ Vector<T>& getTempStorage(uint32_t len) {
// Vector<T>& vec = temp_rb.next();
// tvec.length = len;
// return vec;
// }
};
#endif // __CUDACC_RTC__

#endif //SYSTEMDS_MATRIX_H
52 changes: 9 additions & 43 deletions src/main/cuda/headers/reduction.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#define REDUCTION_CUH

using uint = unsigned int;

#include <cuda_runtime.h>

#include "utils.cuh"
Expand Down Expand Up @@ -51,7 +52,9 @@ using uint = unsigned int;
* @param SpoofCellwiseOp initial value for the reduction variable
*/
template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op, SpoofCellwiseOp spoof_op) {
__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op,
SpoofCellwiseOp spoof_op)
{
auto sdata = shared_memory_proxy<T>();

// perform first level of reduction,
Expand All @@ -66,12 +69,9 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < N) {
// printf("tid=%d i=%d N=%d, in->cols()=%d rix=%d\n", threadIdx.x, i, N, in->cols(), i/in->cols());
v = reduction_op(v, spoof_op(*(in->vals(i)), i, i / in->cols(), i % in->cols()));

if (i + blockDim.x < N) {
//__syncthreads();
//printf("loop fetch i(%d)+blockDim.x(%d)=%d, in=%f\n",i, blockDim.x, i + blockDim.x, g_idata[i + blockDim.x]);
v = reduction_op(v, spoof_op(*(in->vals(i+blockDim.x)), blockDim.x + i, (i+blockDim.x) / in->cols(), (i+blockDim.x) % in->cols()));
}

Expand Down Expand Up @@ -116,40 +116,25 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
if (blockDim.x >= 64) {
smem[tid] = v = reduction_op(v, smem[tid + 32]);
}
// if(tid<12)
// printf("bid=%d tid=%d reduction result: %3.1f\n", blockIdx.x, tid, sdata[tid]);

if (blockDim.x >= 32) {
smem[tid] = v = reduction_op(v, smem[tid + 16]);
}
// if(tid==0)
// printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
if (blockDim.x >= 16) {
smem[tid] = v = reduction_op(v, smem[tid + 8]);
}
// if(tid==0)
// printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
if (blockDim.x >= 8) {
smem[tid] = v = reduction_op(v, smem[tid + 4]);
}
// if(tid==0)
// printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
if (blockDim.x >= 4) {
smem[tid] = v = reduction_op(v, smem[tid + 2]);
}
// if(tid==0)
// printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
if (blockDim.x >= 2) {
smem[tid] = v = reduction_op(v, smem[tid + 1]);
}
// if(tid==0)
// printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
}

// write result for this block to global mem
if (tid == 0) {
// if(gridDim.x < 10)
// printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]);
out->val(0, blockIdx.x) = sdata[0];
}
}
Expand All @@ -174,19 +159,10 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
* the value before writing it to its final location in global memory for each
* row
*/
//template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
//__device__ void ROW_AGG(
// T *g_idata, ///< input data stored in device memory (of size rows*cols)
// T *g_odata, ///< output/temporary array store in device memory (of size
// /// rows*cols)
// uint rows, ///< rows in input and temporary/output arrays
// uint cols, ///< columns in input and temporary/output arrays
// T initialValue, ///< initial value for the reduction variable
// ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
// SpoofCellwiseOp spoof_op) ///< Operation to perform before assigning this
template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
__device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op,
SpoofCellwiseOp spoof_op) {
SpoofCellwiseOp spoof_op)
{
auto sdata = shared_memory_proxy<T>();

// one block per row
Expand All @@ -199,7 +175,6 @@ __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
uint32_t i = tid;
uint block_offset = block * in->cols();

// T v = initialValue;
T v = reduction_op.init();
while (i < in->cols()) {
v = reduction_op(v, spoof_op(in->val(block_offset + i), i, i / in->cols(), i % in->cols()));
Expand Down Expand Up @@ -283,16 +258,8 @@ __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t
*/
template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
__device__ void COL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op,
SpoofCellwiseOp spoof_op) {
//__device__ void COL_AGG(T *g_idata, ///< input data stored in device memory (of size rows*cols)
// T *g_odata, ///< output/temporary array store in device memory (of size rows*cols)
// uint rows, ///< rows in input and temporary/output arrays
// uint cols, ///< columns in input and temporary/output arrays
// T initialValue, ///< initial value for the reduction variable
// ReductionOp reduction_op, ///< Reduction operation to perform (functor object)
// SpoofCellwiseOp spoof_op) ///< Operation to perform before aggregation
//
//{
SpoofCellwiseOp spoof_op)
{
uint global_tid = blockIdx.x * blockDim.x + threadIdx.x;
if (global_tid >= in->cols()) {
return;
Expand All @@ -315,13 +282,12 @@ __device__ void NO_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N
uint32_t gtid = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t first_idx = gtid * static_cast<uint32_t>(VT);
uint32_t last_idx = min(first_idx + static_cast<uint32_t>(VT), N);

#pragma unroll
for(auto i = first_idx; i < last_idx; i++) {
T a = in->hasData() ? in->vals(0)[i] : 0;
T result = spoof_op(a, i, i / in->cols(), i % in->cols());
out->vals(0)[i] = result;
//if(i < 4)
// printf("tid=%d in=%4.3f res=%4.3f out=%4.3f r=%d\n", i, in->vals(0)[i], result, out->vals(0)[i], i/in->cols());
}
}

Expand Down
19 changes: 17 additions & 2 deletions src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,15 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* resourc
CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_d"));
ctx->reduction_kernels_d.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG, SpoofOperator::AggOp::MAX), func));

CHECK_CUDART(cudaMallocHost(reinterpret_cast<void**>(&(ctx->staging_buffer)), ctx->default_mem_size));
CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&(ctx->device_buffer)), ctx->default_mem_size));
ctx->current_mem_size = ctx->default_mem_size;
return reinterpret_cast<size_t>(ctx);
}

void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, [[maybe_unused]] uint32_t device_id) {
cudaFreeHost(ctx->staging_buffer);
cudaFree(ctx->device_buffer);
delete ctx;
// cuda device is handled by jCuda atm
//cudaDeviceReset();
Expand Down Expand Up @@ -116,15 +121,25 @@ size_t SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const std::s

template<typename T>
CUfunction SpoofCUDAContext::getReductionKernel(const std::pair<SpoofOperator::AggType, SpoofOperator::AggOp> &key) {
return nullptr;
return nullptr; // generic case never used
}

template<>
CUfunction SpoofCUDAContext::getReductionKernel<float>(const std::pair<SpoofOperator::AggType,
SpoofOperator::AggOp> &key) {
return reduction_kernels_f[key];
}

template<>
CUfunction SpoofCUDAContext::getReductionKernel<double>(const std::pair<SpoofOperator::AggType,
SpoofOperator::AggOp> &key) {
return reduction_kernels_d[key];
}
}

void SpoofCUDAContext::resize_staging_buffer(size_t size) {
cudaFreeHost(staging_buffer);
cudaFree(device_buffer);
CHECK_CUDART(cudaMallocHost(reinterpret_cast<void**>(&(staging_buffer)), size));
CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&(device_buffer)), size));
current_mem_size = size;
}
62 changes: 18 additions & 44 deletions src/main/cuda/spoof-launcher/SpoofCUDAContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,69 +46,43 @@ class SpoofCUDAContext {
std::map<std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>, CUfunction> reduction_kernels_f;
std::map<std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>, CUfunction> reduction_kernels_d;

// double handling_total, compile_total;

const std::string resource_path;
const std::vector<std::string> include_paths;

public:
size_t default_mem_size = 1024; // 1kb for hosting data pointers, scalars and some meta info. This default should
// not require resizing these buffers in most cases.
size_t current_mem_size = 0; // the actual staging buffer size (should be default unless there was a resize)
std::byte* staging_buffer{}; // pinned host mem for async transfers
std::byte* device_buffer{}; // this buffer holds the pointers to the data buffers

explicit SpoofCUDAContext(const char* resource_path_, std::vector<std::string> include_paths_) : reductions(nullptr),
resource_path(resource_path_), include_paths(std::move(include_paths_))
//,handling_total(0.0), compile_total(0.0)
{}
resource_path(resource_path_), include_paths(std::move(include_paths_)) { }

static size_t initialize_cuda(uint32_t device_id, const char* resource_path_);

static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);

size_t compile(std::unique_ptr<SpoofOperator> op, const std::string &src);

template <typename T, typename CALL>
int launch(uint32_t opID, std::vector<Matrix<T>>& input, std::vector<Matrix<T>>& sides, Matrix<T>& output,
T* scalars, uint32_t grix) {
// dp holds in/side/out/scalar pointers for GPU
DevMatPtrs<T> dp;

SpoofOperator* op = compiled_ops[opID].get();

CHECK_CUDART(cudaMalloc((void **)&dp.in, sizeof(Matrix<T>) * input.size()));
CHECK_CUDART(cudaMemcpy(dp.in, reinterpret_cast<void*>(&input[0]), sizeof(Matrix<T>) * input.size(),
cudaMemcpyHostToDevice));

if (!sides.empty()) {
CHECK_CUDART(cudaMalloc(reinterpret_cast<void **>(&dp.sides), sizeof(Matrix<T>) * sides.size()));
CHECK_CUDART(cudaMemcpy(dp.sides, &sides[0], sizeof(Matrix<T>) * sides.size(), cudaMemcpyHostToDevice));
}

if (op->isSparseSafe() && input.front().row_ptr != nullptr) {
CHECK_CUDART(cudaMemcpy(output.row_ptr, input.front().row_ptr, (input.front().rows+1)*sizeof(uint32_t),
cudaMemcpyDeviceToDevice));
}
#ifndef NDEBUG
std::cout << "output rows: " << output.rows << " cols: " << output.cols << " nnz: " << output.nnz << " format: " <<
(output.row_ptr == nullptr ? "dense" : "sparse") << std::endl;
#endif
size_t out_num_elements = output.rows * output.cols;
if(output.row_ptr)
if(op->isSparseSafe() && output.nnz > 0)
out_num_elements = output.nnz;
CHECK_CUDART(cudaMalloc((void **) &dp.out, sizeof(Matrix<T>)));
CHECK_CUDART(cudaMemset(output.data, 0, out_num_elements * sizeof(T)));
CHECK_CUDART(cudaMemcpy(dp.out, reinterpret_cast<void *>(&output), sizeof(Matrix<T>),
cudaMemcpyHostToDevice));

dp.scalars = scalars;

CALL::exec(this, op, input, sides, output, grix, dp);

int launch() {

DataBufferWrapper dbw(staging_buffer, device_buffer);
SpoofOperator* op = compiled_ops[dbw.op_id()].get();
dbw.toDevice(op->stream);

CALL::exec(this, op, &dbw);

return 0;
}

std::string getOperatorName(uint32_t opID) { return compiled_ops.at(opID)->name; }

template<typename T>
CUfunction getReductionKernel(const std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>& key);

void resize_staging_buffer(size_t size);
};

#endif // SPOOFCUDACONTEXT_H
Loading