<a href="https://colab.research.google.com/github/awaisarif18/PDC-Assignments/blob/main/PDC_Assignment01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile cuda_assignment.cu
/*
 * ======================================================================
 * PDC Lab Assignment #1: Introduction to CUDA
 *
 * This single file contains the code for all three parts of the assignment:
 * 1. Hello GPU: Demonstrates basic kernel launch and thread indexing.
 * 2. Vector Addition: Compares CPU vs. GPU performance for vector addition.
 * 3. Image Inversion: Compares CPU vs. GPU performance for image processing.
 *
 * How to compile in Google Colab (after running %%writefile):
 * !nvcc cuda_assignment.cu -o assignment_runner
 *
 * How to run:
 * !./assignment_runner
 *
 * (Make sure to upload 'input.jpg' for Part 3 to work!)
 * ======================================================================
 */

// Common C/C++ headers
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <chrono> // For C++ high-resolution timers (CPU)

// CUDA runtime header
#include <cuda_runtime.h>

// STB Image headers for loading/saving images (Part 3)
// We must define these implementation macros in *one* C/C++ file
// before including the headers. This is that file.
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

/*
 * ======================================================================
 * Helper Macro for CUDA Error Checking
 * ======================================================================
 *
 * This macro wraps every CUDA API call to check for errors.
 * If a CUDA call fails, it prints the error message, file, and line number,
 * and then exits the program. This is extremely helpful for debugging.
 */
#define CHECK_CUDA(call) do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA Error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
        exit(1); \
    } \
} while (0)

/*
 * ======================================================================
 * Part 1: Hello GPU with CUDA
 * ======================================================================
 *
 * __global__ keyword: This tells the CUDA compiler (NVCC) that this
 * function is a "kernel" - code that will run on the GPU.
 * It is called from the host (CPU) and executed by many threads on the device (GPU).
 */
__global__ void helloGpu() {
    /*
     * Built-in CUDA variables:
     * - threadIdx.x: The ID of the current thread within its block (e.g., 0, 1, ..., 255).
     * - blockIdx.x:  The ID of the current block within the grid (e.g., 0, 1, ...).
     * - blockDim.x:  The number of threads in one block (e.g., 256).
     *
     * We can calculate a unique global ID for every thread in the grid:
     * global_thread_id = (block_id * threads_per_block) + thread_id
     */
    int global_tid = blockIdx.x * blockDim.x + threadIdx.x;

    // printf is available inside kernels (on modern GPUs)
    // It's helpful for debugging, but can be slow if all threads print.
    printf("Hello from thread %d (Block: %d, Thread: %d)\n",
           global_tid, blockIdx.x, threadIdx.x);
}

/*
 * ======================================================================
 * Part 2: Vector Addition
 * ======================================================================
 */

// --- CPU Implementation ---
// A standard C++ function that runs on the host (CPU).
void cpuVectorAdd(int N, float* a, float* b, float* c) {
    for (int i = 0; i < N; ++i) {
        c[i] = a[i] + b[i];
    }
}

// --- GPU Implementation ---
// A CUDA kernel that runs on the device (GPU).
__global__ void gpuVectorAdd(int N, float* d_a, float* d_b, float* d_c) {
    // Calculate the global thread ID.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    /*
     * Grid-Stride Loop:
     * This is a common and robust pattern. Instead of launching exactly N threads,
     * we launch a reasonable number of threads and have each thread
     * loop and process multiple elements.
     *
     * This has two benefits:
     * 1. It works even if N is larger than the max number of threads we can launch.
     * 2. It's often more efficient.
     *
     * gridDim.x * blockDim.x = Total number of threads in the grid.
     * We increment our index 'i' by this total stride in each loop.
     */
    for (int i = idx; i < N; i += gridDim.x * blockDim.x) {
        d_c[i] = d_a[i] + d_b[i];
    }

    /*
     * --- Simpler (but less robust) alternative: ---
     * // Check boundary condition
     * if (idx < N) {
     * d_c[idx] = d_a[idx] + b_c[idx];
     * }
     * This also works, but requires launching at least N threads.
     * The grid-stride loop is generally preferred.
     */
}

/*
 * ======================================================================
 * Part 3: Image Inversion
 * ======================================================================
 */

// --- CPU Implementation ---
void cpuImageInvert(int width, int height, int channels, unsigned char* in_data, unsigned char* out_data) {
    int num_pixels = width * height;

    // Loop through every pixel
    for (int p = 0; p < num_pixels; ++p) {
        int base_idx = p * channels;
        // Invert the R, G, B channels
        out_data[base_idx + 0] = 255 - in_data[base_idx + 0]; // Red
        out_data[base_idx + 1] = 255 - in_data[base_idx + 1]; // Green
        out_data[base_idx + 2] = 255 - in_data[base_idx + 2]; // Blue

        // If there's an Alpha channel (4th channel), just copy it.
        // We don't want to invert transparency.
        if (channels == 4) {
            out_data[base_idx + 3] = in_data[base_idx + 3]; // Alpha
        }
    }
}

// --- GPU Implementation ---
__global__ void gpuImageInvert(int width, int height, int channels, unsigned char* d_in, unsigned char* d_out) {
    int num_pixels = width * height;

    // Calculate the global thread ID, which will correspond to the *pixel* index.
    int pixel_idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Use a grid-stride loop to process all pixels
    for (int p = pixel_idx; p < num_pixels; p += gridDim.x * blockDim.x) {
        int base_idx = p * channels;

        // Invert R, G, B
        d_out[base_idx + 0] = 255 - d_in[base_idx + 0];
        d_out[base_idx + 1] = 255 - d_in[base_idx + 1];
        d_out[base_idx + 2] = 255 - d_in[base_idx + 2];

        // Copy Alpha if it exists
        if (channels == 4) {
            d_out[base_idx + 3] = d_in[base_idx + 3];
        }
    }
}

/*
 * ======================================================================
 * Main Function
 * ======================================================================
 *
 * This runs on the host (CPU) and coordinates all parts.
 */
int main() {
    printf("===========================================\n");
    printf("PDC Lab Assignment #1: CUDA Introduction\n");
    printf("===========================================\n\n");

    // Get basic information about the GPU
    int deviceCount;
    CHECK_CUDA(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
        fprintf(stderr, "No CUDA-capable device found!\n");
        return 1;
    }
    cudaDeviceProp devProp;
    CHECK_CUDA(cudaGetDeviceProperties(&devProp, 0));
    printf("Using GPU: %s\n\n", devProp.name);


    /*
     * ----------------------------------------
     * Part 1: Hello GPU
     * ----------------------------------------
     */
    printf("--- Part 1: Hello GPU ---\n");

    // Launch configuration: <<< Grid, Blocks >>>
    // We launch 2 blocks, and each block has 4 threads.
    // Total threads = 2 * 4 = 8
    dim3 grid_p1(2);
    dim3 blocks_p1(4);
    helloGpu<<<grid_p1, blocks_p1>>>();

    // We MUST wait for the kernel to finish before the program
    // continues. `cudaDeviceSynchronize` waits for all preceding
    // GPU tasks to complete.
    CHECK_CUDA(cudaDeviceSynchronize());
    printf("Part 1 Complete.\n\n");


    /*
     * ----------------------------------------
     * Part 2: Vector Addition
     * ----------------------------------------
     */
    printf("--- Part 2: Vector Addition (N = 10,000,000) ---\n");

    int N = 10000000;
    size_t bytes = N * sizeof(float);

    // 1. Allocate Host (CPU) memory
    float* h_a = (float*)malloc(bytes);
    float* h_b = (float*)malloc(bytes);
    float* h_c_cpu = (float*)malloc(bytes); // For CPU result
    float* h_c_gpu = (float*)malloc(bytes); // For GPU result

    if (!h_a || !h_b || !h_c_cpu || !h_c_gpu) {
        fprintf(stderr, "Failed to allocate host memory for Part 2!\n");
        return 1;
    }

    // 2. Initialize Host data
    for (int i = 0; i < N; ++i) {
        h_a[i] = (float)rand() / RAND_MAX; // Random float 0.0-1.0
        h_b[i] = (float)rand() / RAND_MAX;
    }

    // 3. --- Run CPU Version ---
    printf("Running CPU vector addition...\n");
    auto start_cpu = std::chrono::high_resolution_clock::now();

    cpuVectorAdd(N, h_a, h_b, h_c_cpu);

    auto stop_cpu = std::chrono::high_resolution_clock::now();
    auto duration_ms_cpu = std::chrono::duration_cast<std::chrono::microseconds>(stop_cpu - start_cpu).count() / 1000.0;
    printf("CPU Time: %.3f ms\n", duration_ms_cpu);

    // 4. --- Run GPU Version ---
    printf("Running GPU vector addition...\n");

    // 4a. Allocate Device (GPU) memory
    float* d_a, * d_b, * d_c;
    CHECK_CUDA(cudaMalloc(&d_a, bytes));
    CHECK_CUDA(cudaMalloc(&d_b, bytes));
    CHECK_CUDA(cudaMalloc(&d_c, bytes));

    // 4b. Create CUDA events for timing
    // This is the accurate way to time GPU execution.
    cudaEvent_t start_gpu, stop_gpu;
    CHECK_CUDA(cudaEventCreate(&start_gpu));
    CHECK_CUDA(cudaEventCreate(&stop_gpu));

    // 4c. Record start event
    CHECK_CUDA(cudaEventRecord(start_gpu));

    // 4d. Copy data from Host (CPU) to Device (GPU)
    CHECK_CUDA(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    // 4e. Launch the GPU Kernel
    int threadsPerBlock = 256;
    // Calculate number of blocks needed to cover all N elements
    // This is a standard "ceiling" calculation: (N + 255) / 256
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    gpuVectorAdd<<<blocksPerGrid, threadsPerBlock>>>(N, d_a, d_b, d_c);

    // 4f. Copy results from Device (GPU) back to Host (CPU)
    CHECK_CUDA(cudaMemcpy(h_c_gpu, d_c, bytes, cudaMemcpyDeviceToHost));

    // 4g. Record stop event and synchronize
    CHECK_CUDA(cudaEventRecord(stop_gpu));
    CHECK_CUDA(cudaEventSynchronize(stop_gpu)); // Wait for stop event to finish

    // 4h. Calculate and print GPU time
    float duration_ms_gpu = 0;
    CHECK_CUDA(cudaEventElapsedTime(&duration_ms_gpu, start_gpu, stop_gpu));
    printf("GPU Time: %.3f ms\n", duration_ms_gpu);

    // 5. --- Verification and Speedup ---
    printf("Verifying results...\n");
    bool correct = true;
    for (int i = 0; i < 100; ++i) { // Check first 100 elements
        if (fabs(h_c_cpu[i] - h_c_gpu[i]) > 1e-5) {
            printf("Error at index %d: CPU=%.5f, GPU=%.5f\n", i, h_c_cpu[i], h_c_gpu[i]);
            correct = false;
            break;
        }
    }
    if (correct) {
        printf("Results are correct!\n");
        printf("Speedup (CPU Time / GPU Time): %.2fx\n", duration_ms_cpu / duration_ms_gpu);
    } else {
        printf("Results are INCORRECT!\n");
    }

    // 6. --- Cleanup Part 2 ---
    free(h_a);
    free(h_b);
    free(h_c_cpu);
    free(h_c_gpu);
    CHECK_CUDA(cudaFree(d_a));
    CHECK_CUDA(cudaFree(d_b));
    CHECK_CUDA(cudaFree(d_c));
    CHECK_CUDA(cudaEventDestroy(start_gpu));
    CHECK_CUDA(cudaEventDestroy(stop_gpu));
    printf("Part 2 Complete.\n\n");


    /*
     * ----------------------------------------
     * Part 3: Image Inversion
     * ----------------------------------------
     */
    printf("--- Part 3: Image Inversion ---\n");

    int width, height, channels;
    const char* input_filename = "input.jpg";
    const char* cpu_out_filename = "cpu_inverted.png";
    const char* gpu_out_filename = "gpu_inverted.png";

    // 1. Load image from disk (using stb_image)
    unsigned char* h_img_in = stbi_load(input_filename, &width, &height, &channels, 0);
    if (h_img_in == NULL) {
        fprintf(stderr, "ERROR: Failed to load input image '%s'.\n", input_filename);
        fprintf(stderr, "       Did you upload it to the Colab environment?\n");
    } else {
        printf("Loaded image '%s': %d x %d, %d channels\n", input_filename, width, height, channels);

        if (channels < 3) {
            fprintf(stderr, "ERROR: Image must be RGB (3 channels) or RGBA (4 channels).\n");
        } else {
            size_t img_bytes = width * height * channels * sizeof(unsigned char);

            // 2. Allocate host memory for output images
            unsigned char* h_img_cpu_out = (unsigned char*)malloc(img_bytes);
            unsigned char* h_img_gpu_out = (unsigned char*)malloc(img_bytes);
            if (!h_img_cpu_out || !h_img_gpu_out) {
                fprintf(stderr, "Failed to allocate host memory for Part 3!\n");
                stbi_image_free(h_img_in);
                return 1;
            }

            // 3. --- Run CPU Version ---
            printf("Running CPU image inversion...\n");
            auto start_img_cpu = std::chrono::high_resolution_clock::now();

            cpuImageInvert(width, height, channels, h_img_in, h_img_cpu_out);

            auto stop_img_cpu = std::chrono::high_resolution_clock::now();
            auto duration_ms_img_cpu = std::chrono::duration_cast<std::chrono::microseconds>(stop_img_cpu - start_img_cpu).count() / 1000.0;
            printf("CPU Time: %.3f ms\n", duration_ms_img_cpu);

            // 4. --- Run GPU Version ---
            printf("Running GPU image inversion...\n");

            // 4a. Allocate Device (GPU) memory
            unsigned char* d_img_in, * d_img_out;
            CHECK_CUDA(cudaMalloc(&d_img_in, img_bytes));
            CHECK_CUDA(cudaMalloc(&d_img_out, img_bytes));

            // 4b. Create CUDA events for timing
            cudaEvent_t start_img_gpu, stop_img_gpu;
            CHECK_CUDA(cudaEventCreate(&start_img_gpu));
            CHECK_CUDA(cudaEventCreate(&stop_img_gpu));

            // 4c. Record start event
            CHECK_CUDA(cudaEventRecord(start_img_gpu));

            // 4d. Copy input image from Host to Device
            CHECK_CUDA(cudaMemcpy(d_img_in, h_img_in, img_bytes, cudaMemcpyHostToDevice));

            // 4e. Launch the GPU Kernel
            // We'll launch one thread *per pixel*.
            int num_pixels = width * height;
            int imgThreadsPerBlock = 256;
            int imgBlocksPerGrid = (num_pixels + imgThreadsPerBlock - 1) / imgThreadsPerBlock;

            gpuImageInvert<<<imgBlocksPerGrid, imgThreadsPerBlock>>>(width, height, channels, d_img_in, d_img_out);

            // 4f. Copy inverted image from Device to Host
            CHECK_CUDA(cudaMemcpy(h_img_gpu_out, d_img_out, img_bytes, cudaMemcpyDeviceToHost));

            // 4g. Record stop event and synchronize
            CHECK_CUDA(cudaEventRecord(stop_img_gpu));
            CHECK_CUDA(cudaEventSynchronize(stop_img_gpu));

            // 4h. Calculate and print GPU time
            float duration_ms_img_gpu = 0;
            CHECK_CUDA(cudaEventElapsedTime(&duration_ms_img_gpu, start_img_gpu, stop_img_gpu));
            printf("GPU Time: %.3f ms\n", duration_ms_img_gpu);

            // 5. --- Save results and show speedup ---
            printf("Saving output images...\n");
            stbi_write_png(cpu_out_filename, width, height, channels, h_img_cpu_out, width * channels);
            stbi_write_png(gpu_out_filename, width, height, channels, h_img_gpu_out, width * channels);
            printf("Saved '%s' and '%s'.\n", cpu_out_filename, gpu_out_filename);
            printf("Speedup (CPU Time / GPU Time): %.2fx\n", duration_ms_img_cpu / duration_ms_img_gpu);

            // 6. --- Cleanup Part 3 ---
            stbi_image_free(h_img_in); // Free original image
            free(h_img_cpu_out);
            free(h_img_gpu_out);
            CHECK_CUDA(cudaFree(d_img_in));
            CHECK_CUDA(cudaFree(d_img_out));
            CHECK_CUDA(cudaEventDestroy(start_img_gpu));
            CHECK_CUDA(cudaEventDestroy(stop_img_gpu));
            printf("Part 3 Complete.\n");
        }
    }

    printf("\nAssignment Finished.\n");
    return 0;
}


Writing cuda_assignment.cu


In [8]:
!wget https://raw.githubusercontent.com/nothings/stb/master/stb_image.h
!wget https://raw.githubusercontent.com/nothings/stb/master/stb_image_write.h

--2025-10-29 06:10:23--  https://raw.githubusercontent.com/nothings/stb/master/stb_image.h
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 283010 (276K) [text/plain]
Saving to: ‘stb_image.h’


2025-10-29 06:10:23 (86.6 MB/s) - ‘stb_image.h’ saved [283010/283010]

--2025-10-29 06:10:23--  https://raw.githubusercontent.com/nothings/stb/master/stb_image_write.h
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71221 (70K) [text/plain]
Saving to: ‘stb_image_write.h’


2025-10-29 06:10:24 (45.1 MB/s) - ‘stb_image_write.h’ saved [71221/71

In [9]:
!nvcc cuda_assignment.cu -o assignment_runner

     unsigned int cur, limit, old_limit;
                              ^


                 stbi__uint32 idata_limit_old = idata_limit;
                              ^

        int out_size = 0;
            ^

        int delays_size = 0;
            ^



In [10]:
!./assignment_runner

PDC Lab Assignment #1: CUDA Introduction

Using GPU: Tesla T4

--- Part 1: Hello GPU ---
Part 1 Complete.

--- Part 2: Vector Addition (N = 10,000,000) ---
Running CPU vector addition...
CPU Time: 46.296 ms
Running GPU vector addition...
GPU Time: 45.243 ms
Verifying results...
Error at index 0: CPU=1.23457, GPU=0.00000
Results are INCORRECT!
Part 2 Complete.

--- Part 3: Image Inversion ---
Loaded image 'input.jpg': 194 x 260, 3 channels
Running CPU image inversion...
CPU Time: 0.269 ms
Running GPU image inversion...
GPU Time: 0.162 ms
Saving output images...
Saved 'cpu_inverted.png' and 'gpu_inverted.png'.
Speedup (CPU Time / GPU Time): 1.67x
Part 3 Complete.

Assignment Finished.
