In [1]:
! pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [2]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpll84i2d5".


In [4]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

#define N 512

#define SPARSITY_THRESHOLD 0.3

/*
 * Sparse matrix-vector product for a matrix stored in coordinate (COO) form.
 *
 * For every stored entry k the kernel adds values[k] * vector[col_indices[k]]
 * into result[row_indices[k]] with atomicAdd, so several entries of the same
 * row may be processed concurrently.
 *
 * Parameters:
 *   matrix       - not read by this kernel; kept so existing launches compile.
 *   vector       - dense input vector, indexed by col_indices[k].
 *   row_indices  - row index of each stored entry (nnz elements).
 *   col_indices  - column index of each stored entry (nnz elements).
 *   values       - value of each stored entry (nnz elements).
 *   result       - output vector; the kernel only accumulates into it, so the
 *                  caller must set it to zero before the launch.
 *   nnz          - number of stored entries.
 *
 * Launch: 1-D grid of 1-D blocks of any size. Each thread walks the entries
 * in steps of the total thread count, so a grid with fewer threads than nnz
 * still processes every entry.
 */
__global__ void spmv_coo(const float* matrix, const float* vector, const int* row_indices, const int* col_indices, const float* values, float* result, int nnz) {
    (void)matrix;  // intentionally unused, see header comment

    const int total_threads = gridDim.x * blockDim.x;
    for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < nnz; k += total_threads) {
        const int row = row_indices[k];
        const int col = col_indices[k];
        atomicAdd(&result[row], values[k] * vector[col]);
    }
}

/*
 * Extracts the nonzero entries of a dense, row-major N x N matrix into
 * coordinate (COO) arrays.
 *
 * Entries are emitted in row-major order. For the k-th nonzero found,
 * row_indices[k] and col_indices[k] receive its position and values[k] its
 * value. The number of entries written is stored in *nnz. The output arrays
 * must have room for every nonzero (at most N * N elements).
 */
void convert_to_coo(float* matrix, int* row_indices, int* col_indices, float* values, int* nnz) {
    int stored = 0;

    // Walk the matrix as one flat array; flat / N and flat % N recover (row, col).
    for (int flat = 0; flat < N * N; ++flat) {
        const float entry = matrix[flat];
        if (entry == 0) {
            continue;
        }
        row_indices[stored] = flat / N;
        col_indices[stored] = flat % N;
        values[stored] = entry;
        ++stored;
    }

    *nnz = stored;
}

/* Abort with a readable message when a CUDA runtime call fails. */
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_status_ = (call);                                   \
        if (cuda_status_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_status_));                       \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

/*
 * Driver for the COO sparse matrix-vector product.
 *
 * Steps:
 *   1. Fill an N x N matrix so that roughly SPARSITY_THRESHOLD of the entries
 *      are random integers in [1, 10] and the rest are zero; the input vector
 *      is 1, 2, ..., N.
 *   2. Convert the matrix to COO arrays on the host.
 *   3. Copy the data to the device, run spmv_coo, and time it with CUDA events.
 *   4. Print the result vector and the kernel runtime in milliseconds.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main() {
    // N*N-sized buffers (4 MiB in total for N = 512) live in static storage
    // rather than on the stack, where they could exceed the stack limit.
    static float matrix[N][N];
    static int row_indices[N * N];
    static int col_indices[N * N];
    static float values[N * N];
    float vector[N];
    float result[N];
    int nnz = 0;

    // Initialize random seed
    srand(time(NULL));

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            if ((float)rand() / RAND_MAX < SPARSITY_THRESHOLD) {
                matrix[i][j] = rand() % 10 + 1; // Random value between 1 and 10
            } else {
                matrix[i][j] = 0;
            }
        }
    }
    for (int i = 0; i < N; i++) {
        vector[i] = i + 1;
    }

    convert_to_coo((float*)matrix, row_indices, col_indices, values, &nnz);

    // Allocate at least one element so an all-zero matrix does not request
    // zero-byte device buffers.
    const size_t entries = nnz > 0 ? (size_t)nnz : 1;

    float* d_matrix = NULL;
    float* d_vector = NULL;
    int* d_row_indices = NULL;
    int* d_col_indices = NULL;
    float* d_values = NULL;
    float* d_result = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_matrix, N * N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_vector, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_row_indices, entries * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_col_indices, entries * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_values, entries * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_result, N * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(d_matrix, matrix, N * N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vector, vector, N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_row_indices, row_indices, nnz * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_col_indices, col_indices, nnz * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_values, values, nnz * sizeof(float), cudaMemcpyHostToDevice));

    // The kernel accumulates into result with atomicAdd and cudaMalloc does
    // not clear memory, so the output must start at zero (all-zero bytes are
    // 0.0f).
    CUDA_CHECK(cudaMemset(d_result, 0, N * sizeof(float)));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // One thread per stored entry, rounded up to whole blocks.
    const int threads_per_block = 256;
    const int blocks = (nnz + threads_per_block - 1) / threads_per_block;

    CUDA_CHECK(cudaEventRecord(start));
    if (blocks > 0) {  // a grid of zero blocks is not a valid launch
        spmv_coo<<<blocks, threads_per_block>>>(d_matrix, d_vector, d_row_indices, d_col_indices, d_values, d_result, nnz);
        CUDA_CHECK(cudaGetLastError());  // reports an invalid launch configuration
    }
    CUDA_CHECK(cudaEventRecord(stop));

    // Waiting on the stop event also surfaces errors raised while the kernel ran.
    CUDA_CHECK(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    CUDA_CHECK(cudaMemcpy(result, d_result, N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Result: ");
    for (int i = 0; i < N; i++) {
        printf("%.2f ", result[i]);
    }
    printf("\n");

    printf("Runtime: %.2f ms\n", milliseconds);

    CUDA_CHECK(cudaFree(d_matrix));
    CUDA_CHECK(cudaFree(d_vector));
    CUDA_CHECK(cudaFree(d_row_indices));
    CUDA_CHECK(cudaFree(d_col_indices));
    CUDA_CHECK(cudaFree(d_values));
    CUDA_CHECK(cudaFree(d_result));

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

Result: 249456.00 209795.00 219296.00 221307.00 172656.00 210598.00 233654.00 222259.00 202080.00 197382.00 216336.00 235412.00 201647.00 236479.00 212755.00 189951.00 242636.00 213788.00 238201.00 174176.00 226624.00 234385.00 204100.00 192583.00 167690.00 197616.00 225586.00 196905.00 191971.00 214108.00 245339.00 239336.00 228949.00 205670.00 219042.00 214280.00 228183.00 187774.00 208270.00 198855.00 233879.00 251126.00 166107.00 218442.00 217918.00 234405.00 229523.00 195989.00 225311.00 171873.00 219104.00 233212.00 225017.00 224150.00 222299.00 221503.00 194046.00 195318.00 252998.00 233238.00 244216.00 222952.00 257909.00 230768.00 223227.00 227321.00 193345.00 206560.00 208345.00 230432.00 206182.00 259713.00 208858.00 224012.00 202402.00 227373.00 215793.00 225212.00 209191.00 225405.00 188529.00 214797.00 242476.00 203068.00 165769.00 244145.00 210651.00 247487.00 230404.00 258455.00 225319.00 232502.00 225094.00 204448.00 242132.00 219453.00 229776.00 243511.00 254696.00 21

In [6]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

#define N 512
#define SPARSITY_THRESHOLD 0.3

/*
 * Sparse matrix-vector product for a matrix stored in CSR form.
 *
 * Block b computes row b: result[b] = sum over i in
 * [row_ptr[b], row_ptr[b + 1]) of values[i] * vector[col_indices[i]].
 * The threads of the block split the row's entries between them and combine
 * their partial sums through shared memory; a single thread then writes the
 * row's result. With more than one thread per block the additions happen in
 * a different order than a sequential loop, which can change the last bits
 * of a floating-point result.
 *
 * Parameters:
 *   matrix       - not read by this kernel; kept so existing launches compile.
 *   vector       - dense input vector, indexed by col_indices[i].
 *   row_ptr      - row start offsets; needs gridDim.x + 1 entries.
 *   col_indices  - column index of each stored entry.
 *   values       - value of each stored entry.
 *   result       - output vector, one element per row (gridDim.x elements).
 *
 * Launch: gridDim.x must equal the number of rows (there is no row bounds
 * check). Any block shape is accepted as long as the block holds at most
 * 1024 threads, the size of the shared scratch array.
 */
__global__ void spmv_csr(const float* matrix, const float* vector, const int* row_ptr, const int* col_indices, const float* values, float* result) {
    __shared__ float partial[1024];
    (void)matrix;  // intentionally unused, see header comment

    const int row = blockIdx.x;
    const int nthreads = blockDim.x * blockDim.y * blockDim.z;
    const int tid = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
    const int start = row_ptr[row];
    const int end = row_ptr[row + 1];

    // Each thread accumulates every nthreads-th entry of the row.
    float sum = 0.0f;
    for (int i = start + tid; i < end; i += nthreads) {
        sum += values[i] * vector[col_indices[i]];
    }
    partial[tid] = sum;
    __syncthreads();

    // Fold the upper part of the active range onto the lower part until one
    // value remains. Rounding the half up keeps this correct for thread
    // counts that are not a power of two. The loop bounds depend only on
    // nthreads, so every thread reaches each barrier.
    for (int active = nthreads; active > 1;) {
        const int half = (active + 1) >> 1;
        if (tid + half < active) {
            partial[tid] += partial[tid + half];
        }
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        result[row] = partial[0];
    }
}

/*
 * Compresses a dense, row-major N x N matrix into CSR arrays.
 *
 * The nonzeros of row r end up in positions [row_ptr[r], row_ptr[r + 1]) of
 * col_indices and values, in increasing column order. row_ptr receives N + 1
 * offsets and *nnz the total number of stored entries. col_indices and
 * values must have room for every nonzero (at most N * N elements).
 */
void convert_to_csr(float* matrix, int* row_ptr, int* col_indices, float* values, int* nnz) {
    int filled = 0;
    int r = 0;

    row_ptr[0] = filled;
    while (r < N) {
        const float* row = matrix + r * N;
        for (int c = 0; c < N; ++c) {
            const float entry = row[c];
            if (entry == 0) {
                continue;
            }
            col_indices[filled] = c;
            values[filled] = entry;
            ++filled;
        }
        // Offset one past the last entry of row r, i.e. the start of row r + 1.
        row_ptr[++r] = filled;
    }

    *nnz = filled;
}



/* Abort with a readable message when a CUDA runtime call fails. */
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_status_ = (call);                                   \
        if (cuda_status_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_status_));                       \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

/*
 * Driver for the CSR sparse matrix-vector product.
 *
 * Steps:
 *   1. Fill an N x N matrix so that roughly SPARSITY_THRESHOLD of the entries
 *      are random integers in [1, 10] and the rest are zero; the input vector
 *      is 1, 2, ..., N.
 *   2. Convert the matrix to CSR arrays on the host.
 *   3. Copy the data to the device, run spmv_csr, and time it with CUDA events.
 *   4. Print the result vector and the kernel runtime in milliseconds.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main() {
    // N*N-sized buffers (3 MiB in total for N = 512) live in static storage
    // rather than on the stack, where they could exceed the stack limit.
    static float matrix[N][N];
    static int col_indices[N * N];
    static float values[N * N];
    int row_ptr[N + 1];
    float vector[N];
    float result[N];
    int nnz = 0;

    // Initialize random seed
    srand(time(NULL));

    // Generate sparse matrix
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            if ((float)rand() / RAND_MAX < SPARSITY_THRESHOLD) {
                matrix[i][j] = rand() % 10 + 1; // Random value between 1 and 10
            } else {
                matrix[i][j] = 0;
            }
        }
    }
    for (int i = 0; i < N; i++) {
        vector[i] = i + 1;
    }

    // Convert before allocating so the device buffers can be sized by nnz
    // instead of the dense N * N upper bound.
    convert_to_csr((float*)matrix, row_ptr, col_indices, values, &nnz);

    // Allocate at least one element so an all-zero matrix does not request
    // zero-byte device buffers.
    const size_t entries = nnz > 0 ? (size_t)nnz : 1;

    float* d_matrix = NULL;
    float* d_vector = NULL;
    int* d_row_ptr = NULL;
    int* d_col_indices = NULL;
    float* d_values = NULL;
    float* d_result = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_matrix, N * N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_vector, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_row_ptr, (N + 1) * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_col_indices, entries * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_values, entries * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_result, N * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(d_matrix, matrix, N * N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vector, vector, N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_row_ptr, row_ptr, (N + 1) * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_col_indices, col_indices, nnz * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_values, values, nnz * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // The kernel takes its row from blockIdx.x and has no bounds check, so
    // the grid must contain exactly N blocks. One warp-sized block per row
    // replaces the former 512 threads per row.
    const int threads_per_block = 32;

    CUDA_CHECK(cudaEventRecord(start));
    spmv_csr<<<N, threads_per_block>>>(d_matrix, d_vector, d_row_ptr, d_col_indices, d_values, d_result);
    CUDA_CHECK(cudaGetLastError());  // reports an invalid launch configuration
    CUDA_CHECK(cudaEventRecord(stop));

    // Waiting on the stop event also surfaces errors raised while the kernel ran.
    CUDA_CHECK(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    CUDA_CHECK(cudaMemcpy(result, d_result, N * sizeof(float), cudaMemcpyDeviceToHost));

    printf("Result: ");
    for (int i = 0; i < N; i++) {
        printf("%.2f ", result[i]);
    }
    printf("\n");

    printf("Runtime: %.2f ms\n", milliseconds);

    CUDA_CHECK(cudaFree(d_matrix));
    CUDA_CHECK(cudaFree(d_vector));
    CUDA_CHECK(cudaFree(d_row_ptr));
    CUDA_CHECK(cudaFree(d_col_indices));
    CUDA_CHECK(cudaFree(d_values));
    CUDA_CHECK(cudaFree(d_result));

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}



Result: 229939.00 233788.00 248752.00 228741.00 220020.00 228398.00 190465.00 235417.00 278222.00 242061.00 225642.00 211124.00 230638.00 193658.00 199567.00 244378.00 168047.00 225839.00 194309.00 215747.00 234715.00 211794.00 217538.00 215145.00 217033.00 216237.00 276621.00 246856.00 237215.00 223573.00 202840.00 171660.00 182949.00 222869.00 223231.00 213075.00 225553.00 239261.00 198464.00 214119.00 215356.00 181255.00 240923.00 210558.00 210938.00 185456.00 214007.00 220472.00 236615.00 186268.00 236796.00 193175.00 214085.00 228713.00 205793.00 199841.00 228213.00 214057.00 192793.00 216042.00 258459.00 244582.00 211365.00 257857.00 216019.00 197702.00 233956.00 166275.00 214009.00 220740.00 219989.00 255638.00 181255.00 170634.00 208682.00 224617.00 227525.00 211370.00 249040.00 224534.00 172465.00 187910.00 217595.00 192214.00 196614.00 230839.00 232975.00 239173.00 218003.00 190926.00 237844.00 190848.00 221325.00 222801.00 188007.00 223965.00 220317.00 236497.00 247197.00 19