In [None]:
%%writefile kernel.cl
// Element-wise vector addition: C[i] = A[i] + B[i].
// Each work-item handles exactly one element, selected by its global id in
// dimension 0. There is no bounds check, so the host must launch no more
// work-items than the buffers have elements.
__kernel void vector_add(
    __global const float* A,
    __global const float* B,
    __global float* C
) {
    // Index of the element this work-item is responsible for
    const int gid = get_global_id(0);

    // Load both operands, then store their sum
    const float lhs = A[gid];
    const float rhs = B[gid];
    C[gid] = lhs + rhs;
}


Overwriting kernel.cl


In [None]:
// Write the following C++ OpenCL program to a file named vector_add.cpp
%%writefile vector_add.cpp

// Specify the target OpenCL version (OpenCL 1.2)
#define CL_TARGET_OPENCL_VERSION 120

// Include the main OpenCL header
#include <CL/cl.h>

// Include standard C++ input/output stream library
#include <iostream>

// Include C++ vector container
#include <vector>

// Include file stream library for reading kernel source
#include <fstream>

// Include chrono library for timing measurements
#include <chrono>

// Function to load OpenCL kernel source code from a file
// Reads the entire OpenCL kernel source file `filename` into a string.
//
// Returns the file contents. If `filename` is null or the file cannot be
// opened, a diagnostic is written to std::cerr and an empty string is
// returned, so callers can detect the failure instead of handing an empty
// program to the OpenCL compiler.
std::string loadKernel(const char* filename) {

    // Only attempt to open when a name was actually supplied
    std::ifstream file;
    if (filename != nullptr) {
        file.open(filename);
    }

    // Report the failure instead of silently returning an empty source
    if (!file.is_open()) {
        std::cerr << "loadKernel: cannot open kernel file '"
                  << (filename != nullptr ? filename : "(null)")
                  << "'\n";
        return std::string();
    }

    // Read the entire file into a string and return it
    return std::string(
        std::istreambuf_iterator<char>(file),   // Iterator to beginning of file
        std::istreambuf_iterator<char>()         // Iterator to end of file
    );
}

// Main program entry point
int main() {

    // Define number of elements (2^20)
    const int N = 1 << 20;

    // Compute total size in bytes for one vector
    size_t size = N * sizeof(float);

    // Create input vector A and initialize all elements to 1.0
    std::vector<float> A(N, 1.0f);

    // Create input vector B and initialize all elements to 2.0
    std::vector<float> B(N, 2.0f);

    // Create output vector C (uninitialized)
    std::vector<float> C(N);

    // Declare OpenCL platform identifier
    cl_platform_id platform;

    // Declare OpenCL device identifier
    cl_device_id device;

    // Get the first available OpenCL platform
    clGetPlatformIDs(1, &platform, nullptr);

    // Get the first available device from the platform
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);

    // Create an OpenCL context for the selected device
    cl_context context = clCreateContext(
        nullptr,        // Default context properties
        1,              // Number of devices
        &device,        // Device list
        nullptr,        // No callback
        nullptr,        // No user data
        nullptr         // No error code return
    );

    // Create a command queue for the device
    cl_command_queue queue = clCreateCommandQueue(
        context,        // OpenCL context
        device,         // Target device
        0,              // Queue properties
        nullptr         // No error code return
    );

    // Create device buffer for vector A and copy host data to device
    cl_mem dA = clCreateBuffer(
        context,                                // OpenCL context
        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,// Read-only, copy from host
        size,                                   // Buffer size
        A.data(),                               // Host pointer
        nullptr                                 // No error code return
    );

    // Create device buffer for vector B and copy host data to device
    cl_mem dB = clCreateBuffer(
        context,                                // OpenCL context
        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,// Read-only, copy from host
        size,                                   // Buffer size
        B.data(),                               // Host pointer
        nullptr                                 // No error code return
    );

    // Create device buffer for vector C (output only)
    cl_mem dC = clCreateBuffer(
        context,            // OpenCL context
        CL_MEM_WRITE_ONLY,  // Write-only buffer
        size,               // Buffer size
        nullptr,            // No host pointer
        nullptr             // No error code return
    );

    // Load kernel source code from file "kernel.cl"
    std::string srcCode = loadKernel("kernel.cl");

    // Convert kernel source to C-style string
    const char* src = srcCode.c_str();

    // Create OpenCL program from kernel source
    cl_program program = clCreateProgramWithSource(
        context,    // OpenCL context
        1,          // Number of source strings
        &src,       // Pointer to source code
        nullptr,    // Source lengths (null-terminated)
        nullptr     // No error code return
    );

    // Build (compile) the OpenCL program for the device
    clBuildProgram(
        program,    // OpenCL program
        1,          // Number of devices
        &device,    // Device list
        nullptr,    // Compiler options
        nullptr,    // No callback
        nullptr     // No user data
    );

    // Create kernel object from the compiled program
    cl_kernel kernel = clCreateKernel(
        program,        // Compiled program
        "vector_add",   // Kernel function name
        nullptr         // No error code return
    );

    // Set kernel argument 0 (input vector A)
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &dA);

    // Set kernel argument 1 (input vector B)
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &dB);

    // Set kernel argument 2 (output vector C)
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &dC);

    // Define global work size (one work-item per element)
    size_t globalSize = N;

    // Record start time before kernel execution
    auto start = std::chrono::high_resolution_clock::now();

    // Enqueue kernel for execution
    clEnqueueNDRangeKernel(
        queue,         // Command queue
        kernel,        // Kernel to execute
        1,             // Number of dimensions
        nullptr,       // Global work offset
        &globalSize,   // Global work size
        nullptr,       // Local work size (let OpenCL decide)
        0,             // Number of events to wait for
        nullptr,       // Event wait list
        nullptr        // Event object
    );

    // Wait until kernel execution is complete
    clFinish(queue);

    // Record end time after kernel execution
    auto end = std::chrono::high_resolution_clock::now();

    // Read results from device buffer dC to host vector C
    clEnqueueReadBuffer(
        queue,         // Command queue
        dC,            // Device buffer
        CL_TRUE,       // Blocking read
        0,             // Offset
        size,          // Number of bytes to read
        C.data(),      // Host destination
        0,             // Number of events to wait for
        nullptr,       // Event wait list
        nullptr        // Event object
    );

    // Print kernel execution time in milliseconds
    std::cout << "Vector Add Time: "
              << std::chrono::duration<double, std::milli>(end - start).count()
              << " ms\n";

    // Exit program successfully
    return 0;
}


Overwriting vector_add.cpp


In [None]:
!g++ vector_add.cpp -lOpenCL -o vector_add
!./vector_add


Vector Add Time: 0.000639 ms


In [None]:
// Write the following OpenCL kernel to a file named matrix_mul.cl
%%writefile matrix_mul.cl

// Define an OpenCL kernel function for matrix multiplication
// Computes C = A * B for row-major matrices: A is N x M, B is M x K and
// C is N x K. Each work-item computes one element of C; dimension 0 of the
// NDRange selects the row and dimension 1 selects the column.
__kernel void matrix_mul(

    // Pointer to matrix A stored in global memory (read-only)
    __global const float* A,

    // Pointer to matrix B stored in global memory (read-only)
    __global const float* B,

    // Pointer to matrix C stored in global memory (write-only result)
    __global float* C,

    // Number of rows in matrix A and matrix C
    int N,

    // Number of columns in matrix A and rows in matrix B
    int M,

    // Number of columns in matrix B and matrix C
    int K
) {

    // Get the global row index for this work-item
    const int row = get_global_id(0);

    // Get the global column index for this work-item
    const int col = get_global_id(1);

    // Skip work-items that fall outside C. This happens when the host rounds
    // the global size up to a multiple of the local work-group size; without
    // the guard those work-items would read and write out of bounds.
    if (row >= N || col >= K)
        return;

    // Initialize accumulator for dot product
    float sum = 0.0f;

    // Dot product of row `row` of A with column `col` of B
    for (int i = 0; i < M; i++)
        sum += A[row * M + i] * B[i * K + col];

    // Store the computed value in the result matrix C
    C[row * K + col] = sum;
}


Writing matrix_mul.cl


In [None]:
// Write the following C++ OpenCL host program to a file named matrix_mul.cpp
%%writefile matrix_mul.cpp

// Specify the target OpenCL version (1.2)
#define CL_TARGET_OPENCL_VERSION 120

// Include the main OpenCL header
#include <CL/cl.h>

// Include standard input/output stream library
#include <iostream>

// Include vector container from the C++ standard library
#include <vector>

// Include file stream library for reading kernel source
#include <fstream>

// Include chrono library for performance timing
#include <chrono> // for timing

// Function to load OpenCL kernel source code from a file
// Reads the entire OpenCL kernel source file `filename` into a string.
//
// Returns the file contents. If `filename` is null or the file cannot be
// opened, a diagnostic is written to std::cerr and an empty string is
// returned, so callers can detect the failure instead of handing an empty
// program to the OpenCL compiler.
std::string loadKernel(const char* filename) {

    // Only attempt to open when a name was actually supplied
    std::ifstream file;
    if (filename != nullptr) {
        file.open(filename);
    }

    // Report the failure instead of silently returning an empty source
    if (!file.is_open()) {
        std::cerr << "loadKernel: cannot open kernel file '"
                  << (filename != nullptr ? filename : "(null)")
                  << "'\n";
        return std::string();
    }

    // Read entire file into a string and return it
    return std::string(
        std::istreambuf_iterator<char>(file),
        std::istreambuf_iterator<char>()
    );
}

// Main program entry point
int main() {

    // Define matrix dimensions: A (N×M), B (M×K), C (N×K)
    const int N = 256, M = 256, K = 256;

    // Allocate and initialize matrix A with all elements equal to 1.0
    std::vector<float> A(N * M, 1.0f);

    // Allocate and initialize matrix B with all elements equal to 2.0
    std::vector<float> B(M * K, 2.0f);

    // Allocate matrix C to store the result
    std::vector<float> C(N * K);

    // Declare OpenCL platform identifier
    cl_platform_id platform;

    // Declare OpenCL device identifier
    cl_device_id device;

    // Get the first available OpenCL platform
    clGetPlatformIDs(1, &platform, nullptr);

    // Get the first available OpenCL device (CPU or GPU)
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);

    // Create an OpenCL context for the selected device
    cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);

    // Create a command queue for issuing commands to the device
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, nullptr);

    // Create OpenCL buffer for matrix A and copy data from host to device
    cl_mem dA = clCreateBuffer(
        context,
        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        A.size() * sizeof(float),
        A.data(),
        nullptr
    );

    // Create OpenCL buffer for matrix B and copy data from host to device
    cl_mem dB = clCreateBuffer(
        context,
        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        B.size() * sizeof(float),
        B.data(),
        nullptr
    );

    // Create OpenCL buffer for matrix C (output only)
    cl_mem dC = clCreateBuffer(
        context,
        CL_MEM_WRITE_ONLY,
        C.size() * sizeof(float),
        nullptr,
        nullptr
    );

    // Load kernel source code from file
    std::string srcCode = loadKernel("matrix_mul.cl");

    // Convert kernel source code to C-style string
    const char* src = srcCode.c_str();

    // Create OpenCL program object from source code
    cl_program program = clCreateProgramWithSource(context, 1, &src, nullptr, nullptr);

    // Compile the OpenCL program for the selected device
    clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);

    // Create kernel object from the compiled program
    cl_kernel kernel = clCreateKernel(program, "matrix_mul", nullptr);

    // Set kernel argument 0: pointer to matrix A
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &dA);

    // Set kernel argument 1: pointer to matrix B
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &dB);

    // Set kernel argument 2: pointer to matrix C
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &dC);

    // Set kernel argument 3: number of rows N
    clSetKernelArg(kernel, 3, sizeof(int), &N);

    // Set kernel argument 4: shared dimension M
    clSetKernelArg(kernel, 4, sizeof(int), &M);

    // Set kernel argument 5: number of columns K
    clSetKernelArg(kernel, 5, sizeof(int), &K);

    // Define global work size (2D): one work-item per element of matrix C
    size_t globalSize[2] = { (size_t)N, (size_t)K };

    // ======== start timing ========

    // Record start time before kernel execution
    auto start = std::chrono::high_resolution_clock::now();

    // Enqueue the matrix multiplication kernel for execution
    clEnqueueNDRangeKernel(
        queue,
        kernel,
        2,
        nullptr,
        globalSize,
        nullptr,
        0,
        nullptr,
        nullptr
    );

    // Wait until kernel execution is finished
    clFinish(queue);

    // Record end time after kernel execution
    auto end = std::chrono::high_resolution_clock::now();

    // Compute elapsed time in milliseconds
    std::chrono::duration<double, std::milli> duration_ms = end - start;

    // Print kernel execution time
    std::cout << "Matrix multiplication execution time: "
              << duration_ms.count()
              << " ms\n";

    // Read the result matrix C from device to host memory
    clEnqueueReadBuffer(
        queue,
        dC,
        CL_TRUE,
        0,
        C.size() * sizeof(float),
        C.data(),
        0,
        nullptr,
        nullptr
    );

    // Print completion message
    std::cout << "Matrix multiplication finished\n";

    // Exit program successfully
    return 0;
}


Overwriting matrix_mul.cpp


In [None]:
!g++ matrix_mul.cpp -lOpenCL -o matrix_mul
!./matrix_mul


Matrix multiplication execution time: 0.00042 ms
Matrix multiplication finished


# Control Questions – OpenCL

### 1. What are the main types of memory used in OpenCL?

OpenCL uses several types of memory:

- **Global memory**  
  - Accessible by all work-items on the device.  
  - Large capacity, but high access latency compared with local and private memory.  
  - Used to store large arrays of data.

- **Local memory**  
  - Shared memory for all work-items within a work-group.  
  - Fast, but limited in size.  
  - Ideal for communication between threads in a group.

- **Private memory**  
  - Memory private to each work-item.  
  - Typically implemented in registers.  
  - Very fast but very limited in size.

- **Constant memory**  
  - Read-only memory accessible by all work-items.  
  - Optimized for simultaneous reads by many threads.

---

### 2. How to configure global and local work sizes?

- **Global work size**  
  - Total number of work-items that will execute the kernel.  
  - Usually matches the total size of the dataset.

- **Local work size**  
  - Number of work-items in a single work-group.  
  - Local memory and synchronization are used within a work-group.  
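  - In OpenCL 1.x, each global dimension must be a multiple of the corresponding local dimension.  
  - Proper tuning affects performance:  
    - Too small → low GPU utilization  
    - Too large → may exceed the device's maximum work-group size (`CL_DEVICE_MAX_WORK_GROUP_SIZE`) or its local memory capacity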

**Example:**
```cpp
size_t globalSize[2] = {N, K};
size_t localSize[2] = {16, 16}; // 16x16 threads per work-group
clEnqueueNDRangeKernel(queue, kernel, 2, nullptr, globalSize, localSize, 0, nullptr, nullptr);
```

### 3. How does OpenCL differ from CUDA?

- **OpenCL**  
  - Cross-platform standard for CPUs, GPUs, and FPGAs.  
  - Works on devices from different vendors.  
  - More portable, but more complex to configure and optimize.

- **CUDA**  
  - Proprietary API from NVIDIA.  
  - Works only on NVIDIA GPUs.  
  - Offers simpler and highly optimized control over memory and threads on NVIDIA hardware.

### 4. What are the advantages of using OpenCL?

- Code portability across different devices (CPU, GPU, FPGA).  
- Ability to utilize parallelism and accelerate computation on various architectures.  
- Fine-grained control over memory and threads for performance optimization.  
- Supports modern multithreaded computations and scientific workloads.


This is **ready to paste directly into your report**.  

If you want, I can also **add a small diagram showing memory types and work-group hierarchy** to make your lab report look more professional. Do you want me to do that?