In [4]:
%%writefile kernel.cl
// Element-wise vector addition: C[i] = A[i] + B[i].
//
// Each work-item processes the element at its dimension-0 global id. The
// kernel takes no length argument, so it cannot guard against out-of-range
// ids: the host must launch no more work-items than the buffers have elements.
__kernel void vector_add(
    __global const float* A,
    __global const float* B,
    __global float* C
) {
    // get_global_id() returns size_t; keep that type instead of narrowing to int.
    const size_t gid = get_global_id(0);
    C[gid] = A[gid] + B[gid];
}


Overwriting kernel.cl


In [5]:
%%writefile vector_add.cpp
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <chrono>

// Reads the entire file `filename` and returns its contents as a string.
//
// If `filename` is null or the file cannot be opened, a diagnostic is written
// to std::cerr and an empty string is returned, so callers can detect the
// failure with `.empty()` instead of silently compiling an empty program.
std::string loadKernel(const char* filename) {
    if (filename == nullptr) {
        std::cerr << "loadKernel: null file name\n";
        return std::string();
    }

    std::ifstream file(filename);
    if (!file.is_open()) {
        std::cerr << "loadKernel: cannot open '" << filename << "'\n";
        return std::string();
    }

    // Iterator-pair constructor slurps the stream up to EOF.
    return std::string(
        std::istreambuf_iterator<char>(file),
        std::istreambuf_iterator<char>()
    );
}

int main() {
    const int N = 1 << 20;
    size_t size = N * sizeof(float);

    std::vector<float> A(N, 1.0f);
    std::vector<float> B(N, 2.0f);
    std::vector<float> C(N);

    cl_platform_id platform;
    cl_device_id device;
    clGetPlatformIDs(1, &platform, nullptr);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);

    cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, nullptr);

    cl_mem dA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size, A.data(), nullptr);
    cl_mem dB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size, B.data(), nullptr);
    cl_mem dC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size, nullptr, nullptr);

    std::string srcCode = loadKernel("kernel.cl");
    const char* src = srcCode.c_str();
    cl_program program = clCreateProgramWithSource(context, 1, &src, nullptr, nullptr);
    clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);

    cl_kernel kernel = clCreateKernel(program, "vector_add", nullptr);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &dA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &dB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &dC);

    size_t globalSize = N;

    auto start = std::chrono::high_resolution_clock::now();
    clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, &globalSize, nullptr, 0, nullptr, nullptr);
    clFinish(queue);
    auto end = std::chrono::high_resolution_clock::now();

    clEnqueueReadBuffer(queue, dC, CL_TRUE, 0, size, C.data(), 0, nullptr, nullptr);

    std::cout << "Vector Add Time: "
              << std::chrono::duration<double, std::milli>(end - start).count()
              << " ms\n";

    return 0;
}


Overwriting vector_add.cpp


In [6]:
!g++ vector_add.cpp -lOpenCL -o vector_add
!./vector_add


Vector Add Time: 0.000639 ms


In [7]:
%%writefile matrix_mul.cl
// Dense matrix product C = A * B, using the layouts implied by the indexing
// below: A is N x M (A[row * M + i]), B is M x K (B[i * K + col]) and
// C is N x K (C[row * K + col]).
//
// One work-item computes one element of C: global id 0 selects the row,
// global id 1 selects the column.
__kernel void matrix_mul(
    __global const float* A,
    __global const float* B,
    __global float* C,
    int N, int M, int K
) {
    const int row = (int)get_global_id(0);
    const int col = (int)get_global_id(1);

    // The host may round the global size up to a multiple of the work-group
    // size; surplus work-items must not read or write outside the matrices.
    if (row >= N || col >= K)
        return;

    float sum = 0.0f;
    for (int i = 0; i < M; ++i)
        sum += A[row * M + i] * B[i * K + col];

    C[row * K + col] = sum;
}


Writing matrix_mul.cl


In [10]:
%%writefile matrix_mul.cpp
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include <iostream>
#include <vector>
#include <fstream>
#include <chrono> // for timing

// Reads the entire file `filename` and returns its contents as a string.
//
// If `filename` is null or the file cannot be opened, a diagnostic is written
// to std::cerr and an empty string is returned, so callers can detect the
// failure with `.empty()` instead of silently compiling an empty program.
std::string loadKernel(const char* filename) {
    if (filename == nullptr) {
        std::cerr << "loadKernel: null file name\n";
        return std::string();
    }

    std::ifstream file(filename);
    if (!file.is_open()) {
        std::cerr << "loadKernel: cannot open '" << filename << "'\n";
        return std::string();
    }

    // Iterator-pair constructor slurps the stream up to EOF.
    return std::string(
        std::istreambuf_iterator<char>(file),
        std::istreambuf_iterator<char>()
    );
}

int main() {
    const int N = 256, M = 256, K = 256;

    std::vector<float> A(N * M, 1.0f);
    std::vector<float> B(M * K, 2.0f);
    std::vector<float> C(N * K);

    cl_platform_id platform;
    cl_device_id device;
    clGetPlatformIDs(1, &platform, nullptr);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);

    cl_context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, nullptr);
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, nullptr);

    cl_mem dA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, A.size() * sizeof(float), A.data(), nullptr);
    cl_mem dB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, B.size() * sizeof(float), B.data(), nullptr);
    cl_mem dC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, C.size() * sizeof(float), nullptr, nullptr);

    std::string srcCode = loadKernel("matrix_mul.cl");
    const char* src = srcCode.c_str();
    cl_program program = clCreateProgramWithSource(context, 1, &src, nullptr, nullptr);
    clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);

    cl_kernel kernel = clCreateKernel(program, "matrix_mul", nullptr);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &dA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &dB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &dC);
    clSetKernelArg(kernel, 3, sizeof(int), &N);
    clSetKernelArg(kernel, 4, sizeof(int), &M);
    clSetKernelArg(kernel, 5, sizeof(int), &K);

    size_t globalSize[2] = { (size_t)N, (size_t)K };

    // ======== start timing ========
    auto start = std::chrono::high_resolution_clock::now();

    clEnqueueNDRangeKernel(queue, kernel, 2, nullptr, globalSize, nullptr, 0, nullptr, nullptr);
    clFinish(queue);

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> duration_ms = end - start;
    std::cout << "Matrix multiplication execution time: " << duration_ms.count() << " ms\n";

    clEnqueueReadBuffer(queue, dC, CL_TRUE, 0, C.size() * sizeof(float), C.data(), 0, nullptr, nullptr);

    std::cout << "Matrix multiplication finished\n";

    return 0;
}


Overwriting matrix_mul.cpp


In [11]:
!g++ matrix_mul.cpp -lOpenCL -o matrix_mul
!./matrix_mul


Matrix multiplication execution time: 0.00042 ms
Matrix multiplication finished


# Control Questions – OpenCL

### 1. What are the main types of memory used in OpenCL?

OpenCL uses several types of memory:

- **Global memory**  
  - Accessible by all work-items on the device.  
  - Large capacity, but high access latency compared with local and private memory.  
  - Used to store large arrays of data.

- **Local memory**  
  - Shared memory for all work-items within a work-group.  
  - Fast, but limited in size.  
  - Ideal for communication between threads in a group.

- **Private memory**  
  - Memory private to each work-item.  
  - Typically implemented in registers.  
  - Very fast but very limited in size.

- **Constant memory**  
  - Read-only memory accessible by all work-items.  
  - Optimized for simultaneous reads by many threads.

---

### 2. How to configure global and local work sizes?

- **Global work size**  
  - Total number of work-items that will execute the kernel.  
  - Usually matches the total size of the dataset.

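- **Local work size**  
  - Number of work-items in a single work-group.  
  - Local memory and synchronization are used within a work-group.  
  - In OpenCL 1.x, each global dimension must be a multiple of the matching local dimension; pass `nullptr` as the local size to let the runtime choose.  
  - Proper tuning affects performance:  
    - Too small → low GPU utilization  
    - Too large → may exceed the device's maximum work-group size (`CL_DEVICE_MAX_WORK_GROUP_SIZE`) or its local memory capacity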

**Example:**
```c
size_t globalSize[2] = {N, K};
size_t localSize[2] = {16, 16}; // 16x16 threads per work-group
clEnqueueNDRangeKernel(queue, kernel, 2, nullptr, globalSize, localSize, 0, nullptr, nullptr);
```

### 3. How does OpenCL differ from CUDA?

- **OpenCL**  
  - Cross-platform standard for CPUs, GPUs, and FPGAs.  
  - Works on devices from different vendors.  
  - More portable, but more complex to configure and optimize.

- **CUDA**  
  - Proprietary API from NVIDIA.  
  - Works only on NVIDIA GPUs.  
  - Offers simpler and highly optimized control over memory and threads on NVIDIA hardware.

### 4. What are the advantages of using OpenCL?

- Code portability across different devices (CPU, GPU, FPGA).  
- Ability to utilize parallelism and accelerate computation on various architectures.  
- Fine-grained control over memory and threads for performance optimization.  
- Supports modern multithreaded computations and scientific workloads.


This is **ready to paste directly into your report**.  

If you want, I can also **add a small diagram showing memory types and work-group hierarchy** to make your lab report look more professional. Do you want me to do that?