<a href="https://colab.research.google.com/github/bhavik-mangla/CudaImageFiltering/blob/main/ImageFilteringUsingCuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-e5vxpgkd
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-e5vxpgkd
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 4664a4ef472c35ed55ab1a53c458aa48e6f9919d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.0-py3-none-any.whl size=9547 sha256=15f657626f5c8a90f3e88c756de6b1b0dae55f253bb213026a351a0a360edb8c
  Stored in directory: /tmp/pip-ephem-wheel-cache-k5vvpa2_/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bui

In [4]:
%load_ext nvcc4jupyter


Source files will be saved in "/tmp/tmpnhud14h8".


In [5]:

%%cuda --name testGoogleColab.cu

#include <stdio.h>

// Wraps a CUDA runtime call and reports a failure together with the call site.
// The do { ... } while (0) wrapper makes the expansion a single statement, so
// `if (cond) gpuErrchk(x); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic for a failed CUDA runtime call.
//   code  - status value returned by the call being checked
//   file  - source file of the call site (filled in by gpuErrchk)
//   line  - source line of the call site (filled in by gpuErrchk)
//   abort - when true (the default) the process exits using `code` as its
//           exit status; when false the function only logs and returns
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Kernel: stores a + b into the int that `c` points to. Every launched thread
// performs the same store.
__global__ void add(int a, int b, int *c)
{
   const int total = a + b;
   c[0] = total;
}

// Smoke test: adds two integers on the device with a single-thread launch and
// prints the result. Every CUDA call is checked through gpuErrchk.
int main() {

   // --- Host declarations and initializations
   int a = 2;
   int b = 6;
   int c = 0;

   // --- Device allocations
   int *d_c = nullptr;
   gpuErrchk(cudaMalloc(&d_c, sizeof(int)));

   // --- Kernel execution
   add<<<1,1>>>(a, b, d_c);
   gpuErrchk(cudaPeekAtLastError());   // launch-configuration errors
   gpuErrchk(cudaDeviceSynchronize()); // errors raised while the kernel ran

   // --- Moving the results from device to host
   gpuErrchk(cudaMemcpy(&c, d_c, sizeof(int), cudaMemcpyDeviceToHost));

   // --- Results printout
   printf("%d + %d is %d\n", a, b, c);

   // --- Release the device allocation (previously leaked)
   gpuErrchk(cudaFree(d_c));

   return 0;
}

usage: colab_kernel_launcher.py [-h] [-t] [-p] [-l PROFILER] [-a PROFILER_ARGS] [-c COMPILER_ARGS]

%%cuda magic that compiles and runs CUDA C++ code in this cell. See
https://nvcc4jupyter.readthedocs.io/en/latest/magics.html#cuda for usage details.

options:
  -h, --help            show this help message and exit
  -t, --timeit
  -p, --profile
  -l PROFILER, --profiler PROFILER
  -a PROFILER_ARGS, --profiler-args PROFILER_ARGS
  -c COMPILER_ARGS, --compiler-args COMPILER_ARGS


usage: colab_kernel_launcher.py [-h] [-t] [-p] [-l PROFILER] [-a PROFILER_ARGS] [-c COMPILER_ARGS]
colab_kernel_launcher.py: error: unrecognized arguments: --name testGoogleColab.cu


In [4]:
%%writefile laplacian.cu
#include <stdio.h>
#include <opencv2/core/core.hpp> // Include core functionalities
#include <opencv2/imgcodecs.hpp> // For imread and imwrite
#include <opencv2/highgui/highgui.hpp> // For GUI functionalities, might not be necessary for this script

using namespace cv; // Use the cv namespace to simplify code

// 3x3 box (mean) filter over an 8-bit image with `channel` interleaved bytes
// per pixel, stored as one tightly packed array of `width` pixels per line.
//
// One thread handles one pixel (x from the grid's x dimension, y from its y
// dimension). Every pixel inside the image is written: the window is clamped
// to the image, so pixels on the edge are averaged over the neighbours that
// actually exist instead of being left untouched in dstImage.
//
//   srcImage - input pixels, read only
//   dstImage - output pixels, same layout as srcImage; must not alias srcImage
//   width    - pixels per line
//   height   - number of lines
//   channel  - interleaved bytes per pixel
__global__ void boxFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height, int channel)
{
   const int x = blockIdx.x*blockDim.x + threadIdx.x;
   const int y = blockIdx.y*blockDim.y + threadIdx.y;
   const int w = (int)width;
   const int h = (int)height;

   // Threads of the rounded-up grid that fall outside the image do nothing.
   if(x >= w || y >= h) return;

   // Window bounds, clamped so no read leaves the image.
   const int radius = 3/2;
   const int xBegin = max(x - radius, 0);
   const int xEnd   = min(x + radius, w - 1);
   const int yBegin = max(y - radius, 0);
   const int yEnd   = min(y + radius, h - 1);

   for(int c=0 ; c<channel ; c++)
   {
      // Sum of pixel values
      float sum = 0.0f;
      // Number of pixels that contributed to the sum
      float kS = 0.0f;
      for(int yy=yBegin; yy<=yEnd; yy++) {
         for(int xx=xBegin; xx<=xEnd; xx++) {
            // size_t arithmetic keeps the flat index from wrapping on large images
            sum += srcImage[((size_t)yy*width + xx)*channel + c];
            kS += 1.0f;
         }
      }
      // The mean of 8-bit samples is always within [0, 255]
      dstImage[((size_t)y*width + x)*channel + c] = (unsigned char)(sum / kS);
   }
}

// 3x3 sharpening convolution (centre weight 9, all neighbours -1) over an
// 8-bit image with `channel` interleaved bytes per pixel, stored as one
// tightly packed array of `width` pixels per line.
//
// One thread handles one pixel. Pixels without a complete 3x3 neighbourhood
// (the outermost ring) are copied through unchanged so that every byte of
// dstImage is defined. The convolution result is clamped to [0, 255] before
// the store: the weighted sum can be negative or above 255, and converting
// such a float to unsigned char is undefined behaviour.
//
//   srcImage - input pixels, read only
//   dstImage - output pixels, same layout as srcImage; must not alias srcImage
//   width    - pixels per line
//   height   - number of lines
//   channel  - interleaved bytes per pixel
__global__ void sharpeningFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height, int channel)
{
   const int x = blockIdx.x*blockDim.x + threadIdx.x;
   const int y = blockIdx.y*blockDim.y + threadIdx.y;
   const int w = (int)width;
   const int h = (int)height;

   // Threads of the rounded-up grid that fall outside the image do nothing.
   if(x >= w || y >= h) return;

   const int radius = 3/2;
   const size_t center = ((size_t)y*width + x)*channel;

   // No full neighbourhood: pass the source pixel through.
   if(x < radius || y < radius || x >= w - radius || y >= h - radius)
   {
      for(int c=0 ; c<channel ; c++) dstImage[center + c] = srcImage[center + c];
      return;
   }

   const float kernel[3][3] = {{-1.0f, -1.0f, -1.0f},
                               {-1.0f,  9.0f, -1.0f},
                               {-1.0f, -1.0f, -1.0f}};

   for(int c=0 ; c<channel ; c++)
   {
      // Weighted sum over the 3x3 neighbourhood
      float sum = 0.0f;
      for(int ky=-radius; ky<=radius; ky++) {
         for(int kx=-radius; kx<=radius; kx++) {
            const float fl = srcImage[((size_t)(y+ky)*width + (x+kx))*channel + c];
            sum += fl*kernel[ky+radius][kx+radius];
         }
      }
      // Saturate to the representable 8-bit range before narrowing
      dstImage[center + c] = (unsigned char)fminf(fmaxf(sum, 0.0f), 255.0f);
   }
}

// Terminates the program with a message on stderr when `r` is anything other
// than cudaSuccess; returns normally otherwise.
void checkCudaErrors(cudaError_t r) {
    if (r == cudaSuccess) {
        return;
    }
    const char *reason = cudaGetErrorString(r);
    fprintf(stderr, "CUDA Error: %s\n", reason);
    exit(EXIT_FAILURE);
}

// Host driver: loads images.jpeg as grayscale, runs boxFilter followed by
// sharpeningFilter on the GPU and writes the result to output.jpeg.
// Returns 0 on success and -1 when the image cannot be read or host memory
// cannot be allocated; CUDA failures terminate through checkCudaErrors.
int main() {
    Mat image = imread("images.jpeg", IMREAD_GRAYSCALE);
    if (image.empty()) {
        printf("Error: Image not found.\n");
        return -1;
    }
    // The kernels index the pixels as one tightly packed array.
    if (!image.isContinuous()) {
        image = image.clone();
    }
    const int width = image.cols;
    const int height = image.rows;
    const int channel = image.channels();
    // Widen before multiplying so a large image cannot overflow int arithmetic.
    const size_t imageSize = (size_t)width * height * channel * sizeof(unsigned char);

    unsigned char *h_outputImage = (unsigned char *)malloc(imageSize);
    if (h_outputImage == nullptr) {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    unsigned char *d_inputImage = nullptr, *d_outputImage = nullptr;
    checkCudaErrors(cudaMalloc(&d_inputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_outputImage, imageSize));
    checkCudaErrors(cudaMemcpy(d_inputImage, image.data, imageSize, cudaMemcpyHostToDevice));
    // Seed the intermediate buffer with the input. The second pass reads the
    // outermost pixels of this buffer, so they must hold defined values even
    // if the first pass leaves them untouched.
    checkCudaErrors(cudaMemcpy(d_outputImage, d_inputImage, imageSize, cudaMemcpyDeviceToDevice));

    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);

    // Pass 1: blur, input -> intermediate.
    boxFilter<<<gridSize,blockSize>>>(d_inputImage,d_outputImage,width,height,channel);
    checkCudaErrors(cudaGetLastError());
    // Pass 2: sharpen, intermediate -> input buffer (reused as the final output).
    sharpeningFilter<<<gridSize,blockSize>>>(d_outputImage,d_inputImage,width,height,channel);
    checkCudaErrors(cudaGetLastError());
    // The blocking copy also reports errors raised while the kernels ran.
    checkCudaErrors(cudaMemcpy(h_outputImage, d_inputImage, imageSize, cudaMemcpyDeviceToHost));

    Mat outputImage(height, width, CV_8UC(channel), h_outputImage);
    imwrite("output.jpeg", outputImage);

    free(h_outputImage);
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));

    return 0;
}

Overwriting laplacian.cu


In [5]:
!nvcc -o laplacian laplacian.cu `pkg-config --cflags --libs opencv4`

In [6]:
!./laplacian

In [7]:
%%writefile sobelEdgeDetectionFilter.cu
#include <stdio.h>
#include <opencv2/core/core.hpp> // Include core functionalities
#include <opencv2/imgcodecs.hpp> // For imread and imwrite
#include <opencv2/highgui/highgui.hpp> // For GUI functionalities, might not be necessary for this script

using namespace cv; // Use the cv namespace to simplify code

// Sobel edge detector over a single-channel 8-bit image stored as one tightly
// packed array of `width` pixels per line. The stored value is |Gx| + |Gy|,
// saturated at 255.
//
// One thread handles one pixel. Pixels without a complete 3x3 neighbourhood
// (the outermost ring) are written as 0 so that every byte of dstImage is
// defined. The gradient sum is clamped before the store because it can exceed
// 255, and converting such a float to unsigned char is undefined behaviour.
//
//   srcImage - input pixels, read only
//   dstImage - output pixels, same layout as srcImage; must not alias srcImage
//   width    - pixels per line
//   height   - number of lines
__global__ void sobelFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height)
{
   const int x = blockIdx.x*blockDim.x + threadIdx.x;
   const int y = blockIdx.y*blockDim.y + threadIdx.y;
   const int w = (int)width;
   const int h = (int)height;

   // Threads of the rounded-up grid that fall outside the image do nothing.
   if(x >= w || y >= h) return;

   const int radius = 3/2;
   const size_t center = (size_t)y*width + x;

   // No full neighbourhood: report "no edge".
   if(x < radius || y < radius || x >= w - radius || y >= h - radius)
   {
      dstImage[center] = 0;
      return;
   }

   const float Kx[3][3] = {{-1.0f, 0.0f, 1.0f}, {-2.0f, 0.0f, 2.0f}, {-1.0f, 0.0f, 1.0f}};
   const float Ky[3][3] = {{ 1.0f, 2.0f, 1.0f}, { 0.0f, 0.0f, 0.0f}, {-1.0f, -2.0f, -1.0f}};

   // Both gradients are accumulated in one pass so each neighbour is loaded once.
   float Gx = 0.0f;
   float Gy = 0.0f;
   for(int ky=-radius; ky<=radius; ky++) {
      for(int kx=-radius; kx<=radius; kx++) {
         const float fl = srcImage[(size_t)(y+ky)*width + (x+kx)];
         Gx += fl*Kx[ky+radius][kx+radius];
         Gy += fl*Ky[ky+radius][kx+radius];
      }
   }

   const float magnitude = fabsf(Gx) + fabsf(Gy);
   dstImage[center] = (unsigned char)fminf(magnitude, 255.0f);
}

// Terminates the program with a message on stderr when `r` is anything other
// than cudaSuccess; returns normally otherwise.
void checkCudaErrors(cudaError_t r) {
    if (r == cudaSuccess) {
        return;
    }
    const char *reason = cudaGetErrorString(r);
    fprintf(stderr, "CUDA Error: %s\n", reason);
    exit(EXIT_FAILURE);
}

// Host driver: loads images.jpeg as grayscale, runs sobelFilter on the GPU and
// writes the result to output_sobel.jpeg.
// Returns 0 on success and -1 when the image cannot be read or host memory
// cannot be allocated; CUDA failures terminate through checkCudaErrors.
int main() {
    Mat image = imread("images.jpeg", IMREAD_GRAYSCALE);
    if (image.empty()) {
        printf("Error: Image not found.\n");
        return -1;
    }
    // The kernel indexes the pixels as one tightly packed array.
    if (!image.isContinuous()) {
        image = image.clone();
    }
    const int width = image.cols;
    const int height = image.rows;
    // Widen before multiplying so a large image cannot overflow int arithmetic.
    const size_t imageSize = (size_t)width * height * sizeof(unsigned char);

    unsigned char *h_outputImage = (unsigned char *)malloc(imageSize);
    if (h_outputImage == nullptr) {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    unsigned char *d_inputImage = nullptr, *d_outputImage = nullptr;
    checkCudaErrors(cudaMalloc(&d_inputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_outputImage, imageSize));
    checkCudaErrors(cudaMemcpy(d_inputImage, image.data, imageSize, cudaMemcpyHostToDevice));
    // Zero the output so pixels the kernel does not write (the image border)
    // are saved as "no edge" instead of whatever the allocation contained.
    checkCudaErrors(cudaMemset(d_outputImage, 0, imageSize));

    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
    sobelFilter<<<gridSize,blockSize>>>(d_inputImage,d_outputImage,width,height);
    checkCudaErrors(cudaGetLastError());
    // The blocking copy also reports errors raised while the kernel ran.
    checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

    Mat outputImage(height, width, CV_8UC1, h_outputImage);
    imwrite("output_sobel.jpeg", outputImage);

    free(h_outputImage);
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));

    return 0;
}


Writing sobelEdgeDetectionFilter.cu


In [8]:
!nvcc -o sobelEdgeDetectionFilter sobelEdgeDetectionFilter.cu `pkg-config --cflags --libs opencv4`

In [9]:
!./sobelEdgeDetectionFilter

In [19]:
%%writefile blur.cu
#include <stdio.h>
#include <opencv2/core/core.hpp> // Include core functionalities
#include <opencv2/imgcodecs.hpp> // For imread and imwrite
#include <opencv2/highgui/highgui.hpp> // For GUI functionalities, might not be necessary for this script

using namespace cv; // Use the cv namespace to simplify code

// 5x5 box (mean) filter over an 8-bit image with `channel` interleaved bytes
// per pixel, stored as one tightly packed array of `width` pixels per line.
//
// One thread handles one pixel (x from the grid's x dimension, y from its y
// dimension). Every pixel inside the image is written: the window is clamped
// to the image, so pixels near the edge are averaged over the neighbours that
// actually exist instead of being left untouched in dstImage.
//
//   srcImage - input pixels, read only
//   dstImage - output pixels, same layout as srcImage; must not alias srcImage
//   width    - pixels per line
//   height   - number of lines
//   channel  - interleaved bytes per pixel
__global__ void boxFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height, int channel)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int w = (int)width;
    const int h = (int)height;

    // Threads of the rounded-up grid that fall outside the image do nothing.
    if (x >= w || y >= h)
        return;

    // Larger window than the 3x3 variant for a stronger blur
    const int kernelSize = 5;
    const int halfKernelSize = kernelSize / 2;

    // Window bounds, clamped so no read leaves the image.
    const int xBegin = max(x - halfKernelSize, 0);
    const int xEnd = min(x + halfKernelSize, w - 1);
    const int yBegin = max(y - halfKernelSize, 0);
    const int yEnd = min(y + halfKernelSize, h - 1);

    for (int c = 0; c < channel; c++)
    {
        // Sum of pixel values
        float sum = 0.0f;
        // Number of pixels that contributed to the sum
        float kS = 0.0f;
        for (int yy = yBegin; yy <= yEnd; yy++)
        {
            for (int xx = xBegin; xx <= xEnd; xx++)
            {
                // size_t arithmetic keeps the flat index from wrapping on large images
                sum += srcImage[((size_t)yy * width + xx) * channel + c];
                kS += 1.0f;
            }
        }
        // The mean of 8-bit samples is always within [0, 255]
        dstImage[((size_t)y * width + x) * channel + c] = (unsigned char)(sum / kS);
    }
}


// Terminates the program with a message on stderr when `r` is anything other
// than cudaSuccess. Private to this translation unit.
static void checkCudaErrors(cudaError_t r)
{
    if (r != cudaSuccess)
    {
        fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(r));
        exit(EXIT_FAILURE);
    }
}

// Host driver: loads images.jpeg in colour, runs boxFilter on the GPU and
// writes the result to output_blur.jpeg.
// Returns 0 on success and -1 when the image cannot be read or host memory
// cannot be allocated; CUDA failures terminate through checkCudaErrors.
int main()
{
    Mat image = imread("images.jpeg", IMREAD_COLOR);
    if (image.empty())
    {
        printf("Error: Image not found.\n");
        return -1;
    }
    // The kernel indexes the pixels as one tightly packed array.
    if (!image.isContinuous())
    {
        image = image.clone();
    }
    const int width = image.cols;
    const int height = image.rows;
    const int channel = image.channels();
    // Widen before multiplying so a large image cannot overflow int arithmetic.
    const size_t imageSize = (size_t)width * height * channel * sizeof(unsigned char);

    unsigned char *h_outputImage = (unsigned char *)malloc(imageSize);
    if (h_outputImage == nullptr)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    unsigned char *d_inputImage = nullptr, *d_outputImage = nullptr;
    checkCudaErrors(cudaMalloc(&d_inputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_outputImage, imageSize));
    checkCudaErrors(cudaMemcpy(d_inputImage, image.data, imageSize, cudaMemcpyHostToDevice));
    // Seed the output with the input so pixels the kernel does not write keep
    // their original value instead of whatever the allocation contained.
    checkCudaErrors(cudaMemcpy(d_outputImage, d_inputImage, imageSize, cudaMemcpyDeviceToDevice));

    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
    boxFilter<<<gridSize, blockSize>>>(d_inputImage, d_outputImage, width, height, channel);
    checkCudaErrors(cudaGetLastError());
    // The blocking copy also reports errors raised while the kernel ran.
    checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

    Mat outputImage(height, width, CV_8UC(channel), h_outputImage);
    imwrite("output_blur.jpeg", outputImage);

    free(h_outputImage);
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));

    return 0;
}


Overwriting blur.cu


In [15]:
!nvcc -o blur blur.cu `pkg-config --cflags --libs opencv4`

In [16]:
!./blur

In [20]:
%%writefile mosaic.cu

#include <stdio.h>
#include <opencv2/core/core.hpp> // Include core functionalities
#include <opencv2/imgcodecs.hpp> // For imread and imwrite
#include <opencv2/highgui/highgui.hpp> // For GUI functionalities, might not be necessary for this script

#define MPR 32 // Mosaic pixel radius
#define MPR_SQUARE 1024 // Mosaic pixel radius square

using namespace cv; // Use the cv namespace to simplify code

// Mosaic (pixelation) kernel. Each thread averages one cell of up to c x c
// pixels of `image` and writes that average to every pixel of the same cell
// in `image_output`.
//
// Launch requirements, as used by the indexing below:
//   - blockDim must be (MPR, MPR): the per-thread slot is
//     threadIdx.x + threadIdx.y * MPR and the reduction spans MPR_SQUARE slots.
//   - dynamic shared memory of MPR_SQUARE * sizeof(float3) bytes.
//
// Parameters:
//   image        - input pixels
//   image_output - output pixels, same layout as `image`
//   average      - running per-channel sum over all cells; one thread per
//                  block adds the block's total with atomicAdd. The kernel
//                  never initialises it, so the caller must zero it first.
//   cols         - extent along y (the strided dimension)
//   rows         - extent along x (the unit-stride dimension); also the
//                  distance between consecutive y lines, since a pixel lives
//                  at x + y * rows
//   c            - edge length of a mosaic cell in pixels
// NOTE(review): for a row-major image the stride is the image width, so
// `rows` has to receive the width and `cols` the height - confirm at the
// call site.
__global__ void mosaic_filter_ccmpb(uchar3 *image, uchar3 *image_output, float3 *average, int cols, int rows, int c) {
    extern __shared__ float3 s_average[];

    // Top-left pixel of this thread's cell and the thread's reduction slot
    int x = (threadIdx.x + blockIdx.x * MPR) * c;
    int y = (threadIdx.y + blockIdx.y * MPR) * c;
    int itx = threadIdx.x + threadIdx.y * MPR;

    int m_area;
    int mod_cols = cols % c;
    int mod_rows = rows % c;

    float3 m_average = make_float3(0, 0, 0);

    // Actual cell extent: c for full cells, the remainder for the partial
    // cells along the far edges
    mod_cols = (y < cols - mod_cols) ? c : mod_cols;
    mod_rows = (x < rows - mod_rows) ? c : mod_rows;
    m_area = mod_rows * mod_cols;

    // Sum the cell into registers. The loops run over the actual cell extent
    // (not always c) so a partial cell neither reads past the end of its line
    // or of the buffer, nor sums more pixels than m_area accounts for.
    if (x < rows && y < cols) {
        for (int i = 0; i < mod_rows; ++i) {
            for (int j = 0; j < mod_cols; ++j) {
                int x_offset = x + i;
                int y_offset = y + j;
                int offset = x_offset + y_offset * rows;

                m_average.x += image[offset].x;
                m_average.y += image[offset].y;
                m_average.z += image[offset].z;
            }
        }
    }

    // Block-wide tree reduction of the per-thread sums. Threads outside the
    // image contribute zeros; every thread reaches each barrier.
    s_average[itx] = m_average;
    __syncthreads();

    for (int stride = MPR_SQUARE / 2; stride != 0; stride >>= 1) {
        if (itx < stride) {
            s_average[itx].x += s_average[itx + stride].x;
            s_average[itx].y += s_average[itx + stride].y;
            s_average[itx].z += s_average[itx + stride].z;
        }
        __syncthreads();
    }

    // Thread (0,0) owns slot 0, which now holds the block total
    if (threadIdx.x == 0 && threadIdx.y == 0) {
        atomicAdd(&(average->x), s_average[itx].x);
        atomicAdd(&(average->y), s_average[itx].y);
        atomicAdd(&(average->z), s_average[itx].z);
    }

    // Paint the cell with its mean colour, again limited to the actual extent
    if (x < rows && y < cols) {
        for (int i = 0; i < mod_rows; ++i) {
            for (int j = 0; j < mod_cols; ++j) {
                int x_offset = x + i;
                int y_offset = y + j;
                int offset = x_offset + y_offset * rows;

                image_output[offset].x = (unsigned char)(m_average.x / m_area);
                image_output[offset].y = (unsigned char)(m_average.y / m_area);
                image_output[offset].z = (unsigned char)(m_average.z / m_area);
            }
        }
    }
}

// Terminates the program with a message on stderr when `r` is anything other
// than cudaSuccess; returns normally otherwise.
void checkCudaErrors(cudaError_t r) {
    if (r == cudaSuccess) {
        return;
    }
    const char *reason = cudaGetErrorString(r);
    fprintf(stderr, "CUDA Error: %s\n", reason);
    exit(EXIT_FAILURE);
}

// Host driver: loads images.jpeg in colour, pixelates it on the GPU with
// mosaic_filter_ccmpb and writes the result to output_mosaic.jpg.
// Returns 0 on success and -1 when the image cannot be read; CUDA failures
// terminate through checkCudaErrors.
int main() {
    // Load image
    Mat image = imread("images.jpeg", IMREAD_COLOR);
    if (image.empty()) {
        printf("Error: Image not found.\n");
        return -1;
    }
    // The pixels are copied and indexed as one tightly packed array.
    if (!image.isContinuous()) {
        image = image.clone();
    }
    const int width = image.cols;
    const int height = image.rows;
    // Edge length of one mosaic cell in pixels. This is the kernel's last
    // argument; it is a cell size, not a channel count.
    const int cellSize = 3;

    // Image size in bytes; widened first so large images cannot overflow int
    const size_t imageSize = (size_t)width * height * sizeof(uchar3);

    // Allocate pinned host buffers and device buffers
    uchar3 *h_inputImage = nullptr, *h_outputImage = nullptr;
    uchar3 *d_inputImage = nullptr, *d_outputImage = nullptr;
    float3 *d_average = nullptr;
    checkCudaErrors(cudaMallocHost(&h_inputImage, imageSize));
    checkCudaErrors(cudaMallocHost(&h_outputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_inputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_outputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_average, sizeof(float3)));
    // The kernel accumulates into *d_average with atomicAdd, so it must start
    // at zero (all-zero bytes are 0.0f for each float component).
    checkCudaErrors(cudaMemset(d_average, 0, sizeof(float3)));

    // Stage the input in pinned memory, then upload it
    memcpy(h_inputImage, image.data, imageSize);
    checkCudaErrors(cudaMemcpy(d_inputImage, h_inputImage, imageSize, cudaMemcpyHostToDevice));

    // One thread per mosaic cell, MPR x MPR threads per block
    const unsigned int cellsX = (width + cellSize - 1) / cellSize;
    const unsigned int cellsY = (height + cellSize - 1) / cellSize;
    dim3 blockSize(MPR, MPR);
    dim3 gridSize((cellsX + blockSize.x - 1) / blockSize.x, (cellsY + blockSize.y - 1) / blockSize.y);

    // The kernel addresses a pixel as x + y * rows, i.e. its `rows` argument
    // is the line stride. For this row-major image the stride is the width,
    // so width goes in the `rows` slot and height in the `cols` slot.
    mosaic_filter_ccmpb<<<gridSize, blockSize, MPR_SQUARE * sizeof(float3)>>>(d_inputImage, d_outputImage, d_average, height, width, cellSize);
    checkCudaErrors(cudaGetLastError());

    // Blocking download; also reports errors raised while the kernel ran
    checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

    // Wrap the downloaded pixels (no copy) and write them to disk
    Mat outputImage(height, width, CV_8UC3, h_outputImage);
    imwrite("output_mosaic.jpg", outputImage);

    // Free allocated memory
    checkCudaErrors(cudaFreeHost(h_inputImage));
    checkCudaErrors(cudaFreeHost(h_outputImage));
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));
    checkCudaErrors(cudaFree(d_average));

    return 0;
}


Writing mosaic.cu


In [21]:
!nvcc -o mosaic mosaic.cu `pkg-config --cflags --libs opencv4`

In [22]:
!./mosaic

In [40]:
%%writefile sharpening.cu
#include <stdio.h>
#include <opencv2/core/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;

// 3x3 sharpening convolution (centre weight 9, all neighbours -1) over an
// 8-bit image with `channel` interleaved bytes per pixel, stored as one
// tightly packed array of `width` pixels per line.
//
// One thread handles one pixel. Pixels without a complete 3x3 neighbourhood
// (the outermost ring) are copied through unchanged so that every byte of
// dstImage is defined. The convolution result is clamped to [0, 255] before
// the store: the weighted sum can be negative or above 255, and converting
// such a float to unsigned char is undefined behaviour.
//
//   srcImage - input pixels, read only
//   dstImage - output pixels, same layout as srcImage; must not alias srcImage
//   width    - pixels per line
//   height   - number of lines
//   channel  - interleaved bytes per pixel
__global__ void sharpeningFilter(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height, int channel)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int w = (int)width;
    const int h = (int)height;

    // Threads of the rounded-up grid that fall outside the image do nothing.
    if (x >= w || y >= h)
        return;

    const size_t center = ((size_t)y * width + x) * channel;

    // No full neighbourhood: pass the source pixel through.
    if (x < 1 || y < 1 || x >= w - 1 || y >= h - 1)
    {
        for (int c = 0; c < channel; c++)
            dstImage[center + c] = srcImage[center + c];
        return;
    }

    const float kernel[3][3] = {{-1.0f, -1.0f, -1.0f}, {-1.0f, 9.0f, -1.0f}, {-1.0f, -1.0f, -1.0f}};

    for (int c = 0; c < channel; c++)
    {
        // Weighted sum over the 3x3 neighbourhood
        float sum = 0.0f;
        for (int ky = -1; ky <= 1; ky++)
        {
            for (int kx = -1; kx <= 1; kx++)
            {
                const float fl = srcImage[((size_t)(y + ky) * width + (x + kx)) * channel + c];
                sum += fl * kernel[ky + 1][kx + 1];
            }
        }
        // Saturate to the representable 8-bit range before narrowing
        dstImage[center + c] = (unsigned char)fminf(fmaxf(sum, 0.0f), 255.0f);
    }
}

// Terminates the program with a message on stderr when `r` is anything other
// than cudaSuccess; returns normally otherwise.
void checkCudaErrors(cudaError_t r)
{
    if (r == cudaSuccess)
    {
        return;
    }
    const char *reason = cudaGetErrorString(r);
    fprintf(stderr, "CUDA Error: %s\n", reason);
    exit(EXIT_FAILURE);
}

// Host driver: loads images.jpeg as grayscale, runs sharpeningFilter on the
// GPU and writes the result to output_sharpened.jpeg.
// Returns 0 on success and -1 when the image cannot be read or host memory
// cannot be allocated; CUDA failures terminate through checkCudaErrors.
int main()
{
    Mat image = imread("images.jpeg", IMREAD_GRAYSCALE);
    if (image.empty())
    {
        printf("Error: Image not found.\n");
        return -1;
    }
    // The kernel indexes the pixels as one tightly packed array.
    if (!image.isContinuous())
    {
        image = image.clone();
    }
    const int width = image.cols;
    const int height = image.rows;
    const int channel = image.channels();
    // Widen before multiplying so a large image cannot overflow int arithmetic.
    const size_t imageSize = (size_t)width * height * channel * sizeof(unsigned char);

    unsigned char *h_outputImage = (unsigned char *)malloc(imageSize);
    if (h_outputImage == nullptr)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    unsigned char *d_inputImage = nullptr, *d_outputImage = nullptr;
    checkCudaErrors(cudaMalloc(&d_inputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_outputImage, imageSize));
    checkCudaErrors(cudaMemcpy(d_inputImage, image.data, imageSize, cudaMemcpyHostToDevice));
    // Seed the output with the input so pixels the kernel does not write keep
    // their original value instead of whatever the allocation contained.
    checkCudaErrors(cudaMemcpy(d_outputImage, d_inputImage, imageSize, cudaMemcpyDeviceToDevice));

    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
    sharpeningFilter<<<gridSize, blockSize>>>(d_inputImage, d_outputImage, width, height, channel);
    checkCudaErrors(cudaGetLastError());
    // The blocking copy also reports errors raised while the kernel ran.
    checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

    Mat outputImage(height, width, CV_8UC(channel), h_outputImage);
    imwrite("output_sharpened.jpeg", outputImage);

    free(h_outputImage);
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));

    return 0;
}



Overwriting sharpening.cu


In [35]:
!nvcc -o sharpening sharpening.cu `pkg-config --cflags --libs opencv4`

In [36]:
!./sharpening

In [44]:
%%writefile red.cu
#include <stdio.h>
#include <opencv2/core/core.hpp> // Include core functionalities
#include <opencv2/imgcodecs.hpp> // For imread and imwrite
#include <opencv2/highgui/highgui.hpp> // For GUI functionalities, might not be necessary for this script

using namespace cv; // Use the cv namespace to simplify code

// Doubles the red channel (saturating at 255) of a 3-bytes-per-pixel image
// stored as one tightly packed array of `width` pixels per line, and copies
// the other two channels through unchanged so that every byte of dstImage is
// defined.
//
// One thread handles one pixel. Red is taken from byte index 2 of each pixel.
//
//   srcImage - input pixels, read only
//   dstImage - output pixels, same layout as srcImage
//   width    - pixels per line
//   height   - number of lines
__global__ void redChannelManipulation(unsigned char *srcImage, unsigned char *dstImage, unsigned int width, unsigned int height)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads of the rounded-up grid that fall outside the image do nothing.
    if (x >= width || y >= height)
        return;

    // size_t arithmetic keeps the flat index from wrapping on large images
    const size_t pixel = ((size_t)y * width + x) * 3;

    // Bytes 0 and 1 (blue and green in BGR ordering) pass through untouched
    dstImage[pixel + 0] = srcImage[pixel + 0];
    dstImage[pixel + 1] = srcImage[pixel + 1];

    // Byte 2 (red in BGR ordering): double the intensity, capped at 255
    const int boosted = srcImage[pixel + 2] * 2;
    dstImage[pixel + 2] = (unsigned char)min(255, boosted);
}

// Terminates the program with a message on stderr when `r` is anything other
// than cudaSuccess; returns normally otherwise.
void checkCudaErrors(cudaError_t r) {
    if (r == cudaSuccess) {
        return;
    }
    const char *reason = cudaGetErrorString(r);
    fprintf(stderr, "CUDA Error: %s\n", reason);
    exit(EXIT_FAILURE);
}

// Host driver: loads images.jpeg in colour, runs redChannelManipulation on the
// GPU and writes the result to output_red_modified.jpeg.
// Returns 0 on success and -1 when the image cannot be read or host memory
// cannot be allocated; CUDA failures terminate through checkCudaErrors.
int main() {
    Mat image = imread("images.jpeg", IMREAD_COLOR); // Read image with color channels
    if (image.empty()) {
        printf("Error: Image not found.\n");
        return -1;
    }
    // The kernel indexes the pixels as one tightly packed array.
    if (!image.isContinuous()) {
        image = image.clone();
    }
    const int width = image.cols;
    const int height = image.rows;
    // 3 channels (BGR); widened first so large images cannot overflow int
    const size_t imageSize = (size_t)width * height * 3 * sizeof(unsigned char);

    unsigned char *h_outputImage = (unsigned char *)malloc(imageSize);
    if (h_outputImage == nullptr) {
        fprintf(stderr, "Failed to allocate host memory\n");
        return -1;
    }

    unsigned char *d_inputImage = nullptr, *d_outputImage = nullptr;
    checkCudaErrors(cudaMalloc(&d_inputImage, imageSize));
    checkCudaErrors(cudaMalloc(&d_outputImage, imageSize));
    checkCudaErrors(cudaMemcpy(d_inputImage, image.data, imageSize, cudaMemcpyHostToDevice));
    // Seed the output with the input so channels the kernel does not write
    // keep their original value instead of whatever the allocation contained.
    checkCudaErrors(cudaMemcpy(d_outputImage, d_inputImage, imageSize, cudaMemcpyDeviceToDevice));

    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
    redChannelManipulation<<<gridSize, blockSize>>>(d_inputImage, d_outputImage, width, height);
    checkCudaErrors(cudaGetLastError());
    // The blocking copy also reports errors raised while the kernel ran.
    checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

    Mat outputImage(height, width, CV_8UC3, h_outputImage); // 3 channels (BGR)
    imwrite("output_red_modified.jpeg", outputImage);

    free(h_outputImage);
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));

    return 0;
}


Overwriting red.cu


In [45]:
!nvcc -o red red.cu `pkg-config --cflags --libs opencv4`

In [46]:
!./red