In [1]:
# Print the version of the CUDA compiler driver (nvcc) available in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
# Install the nvcc4jupyter extension directly from its GitHub repository (unpinned HEAD).
# NOTE(review): the following cell also runs `pip install nvcc4jupyter` from PyPI;
# one of the two installs is likely redundant -- confirm which source is intended.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-xb_tfi3b
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-xb_tfi3b
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10743 sha256=3a570d3ab74e9051a615aa78d54722c78a69a0c83a1d407fccde14c37a30e96c
  Stored in directory: /tmp/pip-ephem-wheel-cache-t6o48f7q/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bu

In [3]:
# Install nvcc4jupyter with pip, then load it as an IPython extension.
# NOTE(review): the previous cell already installs this package from GitHub.
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpe7vtn56b".


In [4]:
# Show the attached GPU, driver version and current memory/utilisation.
!nvidia-smi

Fri Oct 18 08:32:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [19]:
%%writefile blur_cuda.cu
// blur_cuda.cu -- saved to disk by the %%writefile cell magic (which must be the first line of the cell).

#include "lodepng.h"
#include <cuda_runtime.h>
#include <stdio.h>

// Box-blur kernel: each thread handles one pixel (x, y) and writes to d_output the
// average of the in-bounds pixels in the 3x3 window centred on it.
// Both buffers are indexed as 4 bytes per pixel, row-major with `width` pixels per
// row; the first three bytes are averaged and the fourth is copied through unchanged.
__global__ void boxBlurKernel(unsigned char* d_image, unsigned char* d_output, int width, int height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads mapped past the image edge have no pixel to produce.
    if (x >= width || y >= height) {
        return;
    }

    int sumR = 0;
    int sumG = 0;
    int sumB = 0;
    int samples = 0;

    // Walk the 3x3 window, skipping positions that fall outside the image.
    for (int dx = -1; dx <= 1; ++dx) {
        const int nx = x + dx;
        if (nx < 0 || nx >= width) {
            continue;
        }
        for (int dy = -1; dy <= 1; ++dy) {
            const int ny = y + dy;
            if (ny < 0 || ny >= height) {
                continue;
            }
            const unsigned char* src = d_image + 4 * (ny * width + nx);
            sumR += src[0];
            sumG += src[1];
            sumB += src[2];
            ++samples;
        }
    }

    // samples is at least 1 here: the centre pixel itself is always in bounds.
    unsigned char* dst = d_output + 4 * (y * width + x);
    dst[0] = sumR / samples;
    dst[1] = sumG / samples;
    dst[2] = sumB / samples;
    dst[3] = d_image[4 * (y * width + x) + 3];  // fourth channel passes through untouched
}

// Terminate the program with a diagnostic when a CUDA runtime call did not succeed.
//   err: status code returned by the CUDA call being checked.
//   msg: short label printed ahead of the CUDA error description.
// Returns normally only when err is cudaSuccess; otherwise prints and exits with status 1.
void checkCudaError(cudaError_t err, const char *msg) {
    if (err == cudaSuccess) {
        return;
    }

    printf("CUDA Error: %s: %s\n", msg, cudaGetErrorString(err));
    exit(1);
}

// Program entry point: decode input.png, blur it on the GPU with boxBlurKernel,
// and write the result to output_blur.png.
// Returns 0 on success and 1 if the PNG cannot be decoded or encoded; any CUDA
// failure terminates the process through checkCudaError.
int main() {
    // Read the PNG file into an image array (4 bytes per pixel)
    unsigned char* image;
    unsigned width, height;
    unsigned error = lodepng_decode32_file(&image, &width, &height, "input.png");

    if (error) {
        printf("Error decoding PNG: %s\n", lodepng_error_text(error));
        return 1;
    }

    // Allocate GPU memory.
    // Widen to size_t BEFORE multiplying: width * height * 4 evaluated in 32-bit
    // unsigned arithmetic would silently wrap for large images.
    unsigned char* d_image;
    unsigned char* d_output;
    size_t imageSize = (size_t)width * height * 4;
    checkCudaError(cudaMalloc(&d_image, imageSize), "Allocating d_image");
    checkCudaError(cudaMalloc(&d_output, imageSize), "Allocating d_output");

    // Copy image data to GPU
    checkCudaError(cudaMemcpy(d_image, image, imageSize, cudaMemcpyHostToDevice), "Copying image to d_image");

    // Define block and grid sizes; the grid is rounded up so partial tiles at the
    // right/bottom edges are still covered (the kernel bounds-checks each thread).
    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);

    // Apply the box blur kernel
    boxBlurKernel<<<gridSize, blockSize>>>(d_image, d_output, width, height);

    // Check the launch itself first (e.g. an invalid configuration), then wait for
    // the kernel and check the status reported by the synchronisation.
    checkCudaError(cudaGetLastError(), "Launching boxBlurKernel");
    checkCudaError(cudaDeviceSynchronize(), "Running boxBlurKernel");

    // Copy blurred image data back to the CPU, reusing the decoded host buffer
    checkCudaError(cudaMemcpy(image, d_output, imageSize, cudaMemcpyDeviceToHost), "Copying d_output to image");

    // Write the blurred image to a file; remember a failure so the exit code reports it
    int status = 0;
    error = lodepng_encode32_file("output_blur.png", image, width, height);
    if (error) {
        printf("Error encoding PNG: %s\n", lodepng_error_text(error));
        status = 1;
    }

    // Free memory (runs on both the success and the encode-failure path)
    cudaFree(d_image);
    cudaFree(d_output);
    free(image);

    return status;
}


Writing blur_cuda.cu


In [20]:
# Compile the CUDA source together with the lodepng implementation into ./blur_cuda.
# NOTE(review): lodepng.cpp (and lodepng.h, included by blur_cuda.cu) must already be in
# the working directory; no cell visible here downloads them -- confirm for a fresh run.
!nvcc -o blur_cuda blur_cuda.cu lodepng.cpp



In [21]:
# Run the compiled binary: it reads input.png from the working directory and writes output_blur.png.
# NOTE(review): input.png is not created by any cell visible here -- confirm it is uploaded beforehand.
!./blur_cuda
