<a href="https://colab.research.google.com/github/Yamna-Shabbir/pdcLab1/blob/main/code3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#include <iostream>
#include <opencv2/opencv.hpp>
#include <chrono>
#include <cuda_runtime.h>

using namespace cv;

// -------------------- CPU Pixel Inversion --------------------
// Host reference implementation of image inversion.
// Stores 255 minus each of the first `size` elements of `input` into the
// matching position of `output`. Nothing is written when `size` is zero or
// negative.
void invertImageCPU(const uchar* input, uchar* output, int size) {
    const uchar* src = input;
    uchar* dst = output;
    for (int remaining = size; remaining > 0; --remaining) {
        *dst++ = 255 - *src++;
    }
}

// -------------------- CUDA Kernel --------------------
// Device implementation of image inversion: output[i] = 255 - input[i] for
// every i in [0, size).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and then advances by the total number of launched threads, so the whole
// buffer is covered for any grid size (including grids with fewer threads
// than `size`). No shared memory is used.
//
// Indices are kept in 64-bit so that the rounded-up thread count of a grid
// covering a buffer close to INT_MAX elements cannot wrap a 32-bit index into
// a negative value that would slip past the bounds check.
//
// `input` and `output` are dereferenced on the device and must therefore be
// device-accessible. A zero or negative `size` results in no writes.
__global__ void invertImageGPU(const uchar* input, uchar* output, int size) {
    const long long count = size;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    for (; idx < count; idx += stride) {
        output[idx] = 255 - input[idx];
    }
}

// Reports a failed CUDA runtime call on stderr.
// `what` names the call for the diagnostic. Returns true iff `status` is
// cudaSuccess, so calls can be chained with && and stop at the first failure.
static bool cudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << "\n";
    return false;
}

// Inverts `size` elements of `hostInput` on the GPU and copies the result into
// `hostOutput`.
//
// On success returns true and stores in `kernelMs` the time between the two
// events recorded immediately around the kernel launch (host<->device copies
// are outside that interval). On any CUDA failure a diagnostic is printed and
// false is returned. Device buffers and events are released on every path.
static bool invertOnDevice(const uchar* hostInput, uchar* hostOutput, int size, float& kernelMs) {
    const size_t bytes = static_cast<size_t>(size) * sizeof(uchar);

    // Null-initialised so the cleanup below is safe no matter where we stopped.
    uchar* d_input = nullptr;
    uchar* d_output = nullptr;
    cudaEvent_t startEvent = nullptr;
    cudaEvent_t stopEvent = nullptr;

    bool ok = cudaSucceeded(cudaMalloc(&d_input, bytes), "cudaMalloc(d_input)")
           && cudaSucceeded(cudaMalloc(&d_output, bytes), "cudaMalloc(d_output)")
           && cudaSucceeded(cudaMemcpy(d_input, hostInput, bytes, cudaMemcpyHostToDevice),
                            "cudaMemcpy(host to device)")
           && cudaSucceeded(cudaEventCreate(&startEvent), "cudaEventCreate(start)")
           && cudaSucceeded(cudaEventCreate(&stopEvent), "cudaEventCreate(stop)")
           && cudaSucceeded(cudaEventRecord(startEvent), "cudaEventRecord(start)");

    if (ok) {
        const int threadsPerBlock = 256;
        // Ceil-division in 64-bit so the rounding term cannot overflow `int`.
        const int blocks = static_cast<int>(
            (static_cast<long long>(size) + threadsPerBlock - 1) / threadsPerBlock);

        invertImageGPU<<<blocks, threadsPerBlock>>>(d_input, d_output, size);

        // A kernel launch returns nothing; configuration errors are only
        // visible through cudaGetLastError(), execution errors through the
        // synchronising calls that follow.
        ok = cudaSucceeded(cudaGetLastError(), "invertImageGPU launch")
          && cudaSucceeded(cudaEventRecord(stopEvent), "cudaEventRecord(stop)")
          && cudaSucceeded(cudaEventSynchronize(stopEvent), "cudaEventSynchronize(stop)")
          && cudaSucceeded(cudaEventElapsedTime(&kernelMs, startEvent, stopEvent),
                           "cudaEventElapsedTime")
          && cudaSucceeded(cudaMemcpy(hostOutput, d_output, bytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host)");
    }

    // -------------------- Cleanup --------------------
    if (startEvent != nullptr) {
        cudaEventDestroy(startEvent);
    }
    if (stopEvent != nullptr) {
        cudaEventDestroy(stopEvent);
    }
    cudaFree(d_input);   // cudaFree(nullptr) is a no-op
    cudaFree(d_output);

    return ok;
}

// Loads "input.jpg" as grayscale, inverts it once on the CPU and once on the
// GPU, reports both timings, checks that the two results are identical, and
// writes them to "inverted_cpu.jpg" / "inverted_gpu.jpg".
// Returns 0 on success and -1 when the image cannot be loaded or a CUDA call
// fails.
int main() {
    // -------------------- Load Image --------------------
    Mat img = imread("input.jpg", IMREAD_GRAYSCALE);
    if (img.empty()) {
        std::cerr << "Failed to load image!\n";
        return -1;
    }

    int imgSize = img.rows * img.cols;
    uchar* inputData = img.data;

    // -------------------- Allocate CPU Output Buffers --------------------
    Mat cpuResult(img.rows, img.cols, CV_8UC1);
    Mat gpuResult(img.rows, img.cols, CV_8UC1);

    // -------------------- CPU Inversion --------------------
    auto start_cpu = std::chrono::high_resolution_clock::now();
    invertImageCPU(inputData, cpuResult.data, imgSize);
    auto end_cpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> cpu_duration = end_cpu - start_cpu;
    std::cout << "CPU Time: " << cpu_duration.count() << " ms\n";

    // -------------------- GPU Inversion --------------------
    float gpu_time_ms = 0.0f;
    if (!invertOnDevice(inputData, gpuResult.data, imgSize, gpu_time_ms)) {
        std::cerr << "GPU inversion failed.\n";
        return -1;
    }
    std::cout << "GPU Time: " << gpu_time_ms << " ms\n";

    // -------------------- Verify Output --------------------
    // Inversion is exact integer arithmetic, so the two results must be
    // identical; any difference at all is reported as a mismatch.
    bool match = true;
    for (int i = 0; i < imgSize; ++i) {
        if (cpuResult.data[i] != gpuResult.data[i]) {
            match = false;
            std::cout << "Mismatch at index " << i << ": CPU=" << (int)cpuResult.data[i]
                      << " GPU=" << (int)gpuResult.data[i] << std::endl;
            break;
        }
    }

    std::cout << (match ? "Images match.\n" : "Images do not match!\n");

    // -------------------- Write Output Images --------------------
    if (!imwrite("inverted_cpu.jpg", cpuResult)) {
        std::cerr << "Failed to write inverted_cpu.jpg\n";
    }
    if (!imwrite("inverted_gpu.jpg", gpuResult)) {
        std::cerr << "Failed to write inverted_gpu.jpg\n";
    }

    // -------------------- Speedup --------------------
    // The ratio compares the CPU loop against kernel execution only; the
    // host<->device transfers are not part of the GPU figure.
    if (gpu_time_ms > 0.0f) {
        double speedup = cpu_duration.count() / gpu_time_ms;
        std::cout << "Speedup = " << speedup << "x\n";
    } else {
        // Avoid dividing by zero when the kernel time is below event resolution.
        std::cout << "Speedup = n/a (GPU time too small to measure)\n";
    }

    return 0;
}
