<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring_2024/blob/main/Week14/SpMV_Formats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-yd7dgvab
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-yd7dgvab
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4305 sha256=942dbf3699e25ba9717b5dd71d6d5c605ad70b12727e62ffc46e22bf53182f7b
  Stored in directory: /tmp/pip-ephem-wheel-cache-s_ispawi/wheels/db/c1/1f/a2bb07bbb4a1ce3c43921252aeafaa6205f08637e292496f04
Successfully built NVCCPlugin
Installing collecte

In [None]:
%%cuda
#include <stdio.h>

// Logs a failed CUDA runtime status to stderr and passes the status through.
//   err : status code produced by a CUDA runtime call
//   msg : caller-supplied label naming the call site, echoed in the log line
// Returns err unchanged so the caller can still act on it; a successful
// status produces no output.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
	if (err == cudaSuccess) {
		return err;
	}

	const char* reason = cudaGetErrorString(err);
	fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, reason);
	return err;
}

// Sparse matrix-vector product y = A * x with A stored in CSR form.
// Each thread handles one row, using a 1D global thread index.
//   num_rows  : number of rows of A; threads with an index >= num_rows do nothing
//   data      : stored values, row after row
//   col_index : column of each stored value (parallel to data)
//   row_ptr   : row r occupies entries [row_ptr[r], row_ptr[r + 1]) of data,
//               so the kernel reads row_ptr[0 .. num_rows]
//   x         : input vector, indexed by column
//   y         : output vector, y[r] is overwritten with the dot product of row r and x
__global__ void spmv_csr(int num_rows, int *data, int *col_index, int* row_ptr, int *x, int *y)
{
    const int r = blockIdx.x * blockDim.x + threadIdx.x;
    if (r >= num_rows)
        return;   // surplus thread beyond the last row

    int k = row_ptr[r];
    const int stop = row_ptr[r + 1];

    // An empty row (k == stop) skips the loop and stores 0.
    int acc = 0;
    while (k < stop)
    {
        acc += data[k] * x[col_index[k]];
        ++k;
    }

    y[r] = acc;
}

// Sparse matrix-vector product y = A * x with A stored in ELL form.
// Each thread handles one row, using a 1D global thread index.
//   num_rows     : number of rows of A; threads with an index >= num_rows do nothing
//   data         : padded values; entry j of row r is read from data[r + j * num_rows]
//   col_index    : column of each padded entry, laid out the same way as data
//   num_elements : number of entries read PER ROW (the loop trip count), so the
//                  kernel touches indices up to num_rows * num_elements - 1
//   x            : input vector, indexed by column
//   y            : output vector, y[r] is overwritten with the dot product of row r and x
__global__ void spmv_ell(int num_rows, int *data, int *col_index, int num_elements, int *x, int *y)
{
    const int r = blockIdx.x * blockDim.x + threadIdx.x;
    if (r >= num_rows)
        return;   // surplus thread beyond the last row

    int acc = 0;
    for (int j = 0; j < num_elements; ++j)
    {
        // Consecutive rows sit next to each other within slice j.
        const int slot = j * num_rows + r;
        const int value = data[slot];
        const int column = col_index[slot];
        acc += value * x[column];
    }

    y[r] = acc;
}

// Sparse matrix-vector product y = A * x with A stored in COO form.
// Each thread handles one row, using a 1D global thread index: it scans every
// stored entry and accumulates only those whose row_index matches its own row.
//   num_rows     : number of rows of A; threads with an index >= num_rows do nothing
//   data         : stored values, data[j] is the j-th stored entry
//   col_index    : column of entry j
//   row_index    : row of entry j
//   num_elements : number of stored entries (length of data / col_index / row_index)
//   x            : input vector, indexed by column
//   y            : output vector, y[r] is overwritten with the dot product of row r and x
//
// The entries may be in any order. Because every thread writes only its own
// y[row], no atomics are needed and y does not have to be zeroed beforehand.
__global__ void spmv_coo(int num_rows, int *data, int *col_index, int* row_index,int num_elements, int *x, int *y)
{
    int row = threadIdx.x + blockIdx.x * blockDim.x;

    if (row < num_rows)
    {
        int dotProduct = 0;

        for (int j = 0; j < num_elements; j++)
        {
            // Only entries that belong to this thread's row contribute, and the
            // value of entry j is data[j] (indexed by entry, not by row number).
            if (row_index[j] == row)
            {
                dotProduct += data[j] * x[col_index[j]];
            }
        }

        y[row] = dotProduct;
    }
}

// Completes the most recent kernel launch and shows its result.
//   kernelName : label used in error messages
//   title      : heading printed before the result lines
//   x, y       : input and output vectors of length n (host-accessible memory)
// On success prints every x[i] / y[i] pair and then zeroes y so the next
// kernel starts from a clean output vector. Returns false, without touching
// x or y, if the launch or the kernel execution reported an error.
static bool finishKernel(const char* kernelName, const char* title, const int* x, int* y, int n)
{
	// Launch-configuration errors are reported by cudaGetLastError(); errors
	// raised while the kernel runs are returned by the synchronize call.
	if (checkCudaErr(cudaGetLastError(), kernelName) != cudaSuccess)
		return false;
	if (checkCudaErr(cudaDeviceSynchronize(), kernelName) != cudaSuccess)
		return false;

	puts(title);
	for (int i = 0; i < n; ++i)
	{
		printf("x[i]: %d, y[i]: %d\n", x[i], y[i]);
		y[i] = 0; //preparing for next kernel
	}
	return true;
}

// Demo driver: stores one 4x4 sparse matrix in CSR, COO and ELL form, runs the
// matching SpMV kernel for each format on the same vector x and prints y.
//
// The matrix is
//     | 3 0 1 0 |
//     | 0 0 0 0 |
//     | 0 2 4 1 |
//     | 1 0 0 1 |
// and x = {1, 2, 3, 4}, so the exact product A*x is {6, 0, 20, 5}.
//
// Returns 0 on success and 1 if any CUDA call or kernel reported an error.
int main()
{
	const int N = 4;                     // number of rows; also the length of x and y
	const int NNZ = 7;                   // stored non-zeros (CSR and COO)
	const int ELL_WIDTH = 3;             // padded entries per row = longest row (row 2)
	const int ELL_SIZE = N * ELL_WIDTH;  // total number of padded ELL entries

	//data for CSR and COO formats
	const int data[NNZ] = {3, 1, 2, 4, 1, 1, 1};
	const int row_ptr[N + 1] = {0, 2, 2, 5, 7};
	const int col_index[NNZ] = {0, 2, 1, 2, 3, 0, 3};
	const int row_index[NNZ] = {0, 0, 2, 2, 2, 3, 3};

	//data for ELL format: entry j of row r lives at index r + j*N
	const int dataPaddedTransposed[ELL_SIZE] = {3, 0, 2, 1, 1, 0, 4, 1, 0, 0, 1, 0 };
	const int col_indexPaddedTransposed[ELL_SIZE] = { 0, 0, 1, 0, 2, 0, 2, 3, 0, 0, 3, 0};

	// Start from NULL so the cleanup below is safe even after a failed allocation.
	int* dataIn = NULL;
	int* dataInELL = NULL;
	int* colIndex = NULL;
	int* rowIndex = NULL;
	int* colIndexELL = NULL;
	int* rowPtr = NULL;
	int* x = NULL;
	int* y = NULL;

	int exitCode = 0;

	// Allocate Unified Memory -- accessible from CPU or GPU
	bool allocated = true;
	allocated = (checkCudaErr(cudaMallocManaged(&x, N*sizeof(int)), "cudaMallocManaged1") == cudaSuccess) && allocated;
	allocated = (checkCudaErr(cudaMallocManaged(&y, N*sizeof(int)), "cudaMallocManaged2") == cudaSuccess) && allocated;
	allocated = (checkCudaErr(cudaMallocManaged(&dataIn, NNZ*sizeof(int)), "cudaMallocManaged3") == cudaSuccess) && allocated;
	allocated = (checkCudaErr(cudaMallocManaged(&colIndex, NNZ*sizeof(int)), "cudaMallocManaged4") == cudaSuccess) && allocated;
	allocated = (checkCudaErr(cudaMallocManaged(&rowPtr, (N + 1)*sizeof(int)), "cudaMallocManaged5") == cudaSuccess) && allocated;
	allocated = (checkCudaErr(cudaMallocManaged(&dataInELL, ELL_SIZE*sizeof(int)), "cudaMallocManaged6") == cudaSuccess) && allocated;
	allocated = (checkCudaErr(cudaMallocManaged(&colIndexELL, ELL_SIZE*sizeof(int)), "cudaMallocManaged7") == cudaSuccess) && allocated;
	allocated = (checkCudaErr(cudaMallocManaged(&rowIndex, NNZ*sizeof(int)), "cudaMallocManaged8") == cudaSuccess) && allocated;

	if (allocated)
	{
		// fill in the memory with data
		for (int i = 0; i < N; i++)
		{
			x[i] = i+1;
			y[i] = 0;
		}
		for (int i = 0; i < N + 1; i++)
		{
			rowPtr[i] = row_ptr[i];
		}
		for (int i = 0; i < NNZ; i++)
		{
			dataIn[i] = data[i];
			colIndex[i] = col_index[i];
			rowIndex[i] = row_index[i];
		}
		for (int i = 0; i < ELL_SIZE; i++)
		{
			dataInELL[i] = dataPaddedTransposed[i];
			colIndexELL[i] = col_indexPaddedTransposed[i];
		}

		// One thread per row; round the block count up so every row is covered.
		const int threadsPerBlock = 32;
		const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

		// call the CSR kernel
		spmv_csr<<<blocksPerGrid, threadsPerBlock>>>(N, dataIn, colIndex, rowPtr, x, y);
		bool ok = finishKernel("spmv_csr", "Output using CSR Format: ", x, y, N);

		//now lets try the ELL format
		if (ok)
		{
			// The kernel's num_elements is the number of padded entries PER ROW,
			// not the total buffer size; passing ELL_SIZE here would make it
			// read far past the end of both ELL buffers.
			spmv_ell<<<blocksPerGrid, threadsPerBlock>>>(N, dataInELL, colIndexELL, ELL_WIDTH, x, y);
			ok = finishKernel("spmv_ell", "\nOutput using ELL Format: ", x, y, N);
		}

		//now lets try the COO format
		if (ok)
		{
			spmv_coo<<<blocksPerGrid, threadsPerBlock>>>(N, dataIn, colIndex, rowIndex, NNZ, x, y);
			ok = finishKernel("spmv_coo", "\nOutput using COO Format: ", x, y, N);
		}

		if (!ok)
			exitCode = 1;
	}
	else
	{
		exitCode = 1;
	}

	// free memory on the gpu side (cudaFree accepts NULL, so this is safe on every path)
	bool released = true;
	released = (checkCudaErr( cudaFree( x ), "cudaFree1") == cudaSuccess) && released;
	released = (checkCudaErr( cudaFree( y ), "cudaFree2") == cudaSuccess) && released;
	released = (checkCudaErr( cudaFree( dataIn ), "cudaFree3") == cudaSuccess) && released;
	released = (checkCudaErr( cudaFree( colIndex ), "cudaFree4") == cudaSuccess) && released;
	released = (checkCudaErr( cudaFree( rowPtr ), "cudaFree5") == cudaSuccess) && released;
	released = (checkCudaErr( cudaFree( dataInELL ), "cudaFree6") == cudaSuccess) && released;
	released = (checkCudaErr( cudaFree( colIndexELL ), "cudaFree7") == cudaSuccess) && released;
	released = (checkCudaErr( cudaFree( rowIndex ), "cudaFree8") == cudaSuccess) && released;
	released = (checkCudaErr( cudaDeviceReset(), "cudaDeviceReset") == cudaSuccess) && released;

	if (!released)
		exitCode = 1;

	return exitCode;
}

Output using CSR Format: 
x[i]: 1, y[i]: 6
x[i]: 2, y[i]: 0
x[i]: 3, y[i]: 20
x[i]: 4, y[i]: 5

Output using ELL Format: 
x[i]: 1, y[i]: 6
x[i]: 2, y[i]: 0
x[i]: 3, y[i]: 20
x[i]: 4, y[i]: 5

Output using COO Format: 
x[i]: 1, y[i]: 50
x[i]: 2, y[i]: 50
x[i]: 3, y[i]: 50
x[i]: 4, y[i]: 50

