<a href="https://colab.research.google.com/github/atharva0300/BE-8th-Semester/blob/main/hpc_mini_project/Sparse_Matrix_Multiplication_CUDA___HPC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
pip install nvcc4jupyter



In [8]:
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [10]:
!update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 60 --slave /usr/bin/g++ g++ /usr/bin/g++-7


update-alternatives: error: alternative path /usr/bin/gcc-7 doesn't exist


In [11]:
!gcc --version


gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [12]:
%%cuda
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

// CUDA kernel for sparse-times-dense matrix multiplication: C += A * B.
//
// A is supplied in CSR form (row_ptr / col_indices / values). B and C are
// dense and indexed row-major with num_cols elements per row.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per row of A. Threads
// whose global index is >= num_rows do nothing, so the grid may be larger
// than num_rows. No shared memory is used.
//
// Parameters:
//   row_ptr      num_rows + 1 offsets; the non-zeros of row r are the entries
//                [row_ptr[r], row_ptr[r + 1]) of col_indices / values.
//   col_indices  column index in A (= row index in B) of each non-zero.
//   values       value of each non-zero.
//   B            dense right-hand matrix, num_cols elements per row.
//   C            dense result, num_rows x num_cols. The kernel ACCUMULATES
//                into C, so the caller must initialise it (e.g. to zero)
//                before the launch.
//   num_rows     number of rows of A and C.
//   num_cols     number of columns of B and C.
__global__ void sparseMatrixMul(const int *row_ptr, const int *col_indices, const float *values,
                                const float *B, float *C, int num_rows, int num_cols)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= num_rows)
    {
        return; // surplus thread in the last block
    }

    // Offsets are computed in size_t so that row * num_cols cannot overflow
    // a 32-bit int on large matrices.
    float *c_row = C + static_cast<size_t>(row) * num_cols;

    const int row_end = row_ptr[row + 1];
    for (int i = row_ptr[row]; i < row_end; ++i)
    {
        const float val = values[i];
        const float *b_row = B + static_cast<size_t>(col_indices[i]) * num_cols;
        for (int k = 0; k < num_cols; ++k)
        {
            c_row[k] += val * b_row[k];
        }
    }
}

// Prints a diagnostic to stderr and terminates the program if a CUDA runtime
// call did not return cudaSuccess. `what` names the failed step.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Demo driver: multiplies a small 3x3 CSR matrix A by a dense 3x2 matrix B on
// the GPU and prints the 3x2 result C to stdout.
int main()
{
    // Compile-time dimensions so the host result buffer is a plain array
    // rather than a (non-standard) variable-length array.
    const int num_rows = 3; // rows of A and C
    const int num_cols = 2; // columns of B and C

    // A is a sparse matrix in CSR format.
    int row_ptr[num_rows + 1] = {0, 2, 3, 5};
    int col_indices[5] = {0, 1, 0, 1, 2};
    float values[5] = {2.0f, 3.0f, 4.0f, 1.0f, 5.0f};

    // B is a dense row-major matrix.
    float B[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};

    // Host buffer for the result.
    float C[num_rows * num_cols];

    // Byte sizes are derived from the host arrays so they cannot drift apart
    // from the data that is actually copied.
    const size_t row_ptr_bytes = sizeof(row_ptr);
    const size_t col_bytes = sizeof(col_indices);
    const size_t val_bytes = sizeof(values);
    const size_t b_bytes = sizeof(B);
    const size_t c_bytes = sizeof(C);

    // Allocate device memory for A (CSR arrays), B and C.
    int *d_row_ptr = nullptr;
    int *d_col_indices = nullptr;
    float *d_values = nullptr;
    float *d_B = nullptr;
    float *d_C = nullptr;
    checkCuda(cudaMalloc((void **)&d_row_ptr, row_ptr_bytes), "cudaMalloc(d_row_ptr)");
    checkCuda(cudaMalloc((void **)&d_col_indices, col_bytes), "cudaMalloc(d_col_indices)");
    checkCuda(cudaMalloc((void **)&d_values, val_bytes), "cudaMalloc(d_values)");
    checkCuda(cudaMalloc((void **)&d_B, b_bytes), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void **)&d_C, c_bytes), "cudaMalloc(d_C)");

    // Copy A and B to the device.
    checkCuda(cudaMemcpy(d_row_ptr, row_ptr, row_ptr_bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(row_ptr)");
    checkCuda(cudaMemcpy(d_col_indices, col_indices, col_bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(col_indices)");
    checkCuda(cudaMemcpy(d_values, values, val_bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(values)");
    checkCuda(cudaMemcpy(d_B, B, b_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(B)");

    // The kernel accumulates with +=, and cudaMalloc does not clear memory,
    // so C must be zeroed explicitly (all-zero bytes are 0.0f).
    checkCuda(cudaMemset(d_C, 0, c_bytes), "cudaMemset(d_C)");

    // Launch one thread per row; ceil-div so any num_rows is covered.
    const int threads_per_block = 128;
    const int blocks = (num_rows + threads_per_block - 1) / threads_per_block;
    sparseMatrixMul<<<blocks, threads_per_block>>>(d_row_ptr, d_col_indices, d_values,
                                                   d_B, d_C, num_rows, num_cols);
    // Kernel launches return no status; launch-configuration errors are
    // reported through cudaGetLastError.
    checkCuda(cudaGetLastError(), "sparseMatrixMul launch");

    // Copy the result back. This blocking copy also waits for the kernel and
    // reports any error raised while it was running.
    checkCuda(cudaMemcpy(C, d_C, c_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(C)");

    // Print the result matrix C.
    std::cout << "Result matrix C:" << std::endl;
    for (int i = 0; i < num_rows; ++i)
    {
        for (int j = 0; j < num_cols; ++j)
        {
            std::cout << C[i * num_cols + j] << " ";
        }
        std::cout << std::endl;
    }

    // Free device memory.
    checkCuda(cudaFree(d_row_ptr), "cudaFree(d_row_ptr)");
    checkCuda(cudaFree(d_col_indices), "cudaFree(d_col_indices)");
    checkCuda(cudaFree(d_values), "cudaFree(d_values)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    return 0;
}


Result matrix C:
11 16 
4 8 
28 34 

