In [1]:
%%writefile cuda_malloc_managed.cu

#include <stdio.h>
#include <cuda.h>
#include <chrono>

#define N 1000000  // Number of int elements in each benchmark array

// Squares each of the first `size` elements of `arr` in place.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
// index and then advances by the total number of launched threads (grid-stride
// loop), so every element is processed even when the launch has fewer threads
// than `size`. A non-positive `size` is a no-op. No shared memory is used.
//
// `arr` must be dereferenceable from device code (device or managed memory).
// The product is computed in `int`, so inputs whose magnitude exceeds 46340
// overflow the result type.
__global__ void squareKernel(int *arr, int size) {
    // Widen to size_t so the index arithmetic cannot wrap for large grids.
    const size_t count = size > 0 ? (size_t)size : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
        arr[i] *= arr[i];
    }
}

// Times squareKernel on a buffer allocated with cudaMallocManaged and prints
// the elapsed wall-clock time in milliseconds.
//
// The buffer is filled on the host with 1..N before the timer starts. The
// timed region covers the kernel launch plus cudaDeviceSynchronize(); on
// systems where managed pages migrate on demand, that region presumably also
// absorbs the host-to-device migration cost (verify with a profiler).
//
// On any CUDA failure a diagnostic goes to stderr, no timing is printed, and
// the buffer is still released.
void usingUnifiedMemory() {
    const size_t bytes = N * sizeof(int);
    int *d_arr = nullptr;

    cudaError_t err = cudaMallocManaged(&d_arr, bytes);
    if (err != cudaSuccess) {
        // Without this check the init loop below would dereference a null pointer.
        fprintf(stderr, "Unified Memory: cudaMallocManaged failed: %s\n",
                cudaGetErrorString(err));
        return;
    }
    for (int i = 0; i < N; i++) d_arr[i] = i + 1;

    auto start = std::chrono::high_resolution_clock::now();
    squareKernel<<<(N + 255) / 256, 256>>>(d_arr, N);
    // A bad launch configuration is only reported through cudaGetLastError().
    err = cudaGetLastError();
    if (err == cudaSuccess) {
        // Faults raised while the kernel runs surface at the next sync point.
        err = cudaDeviceSynchronize();
    }
    auto end = std::chrono::high_resolution_clock::now();

    if (err != cudaSuccess) {
        fprintf(stderr, "Unified Memory: kernel failed: %s\n", cudaGetErrorString(err));
    } else {
        double duration = std::chrono::duration<double, std::milli>(end - start).count();
        printf("Unified Memory Time: %.3f ms\n", duration);
    }

    cudaFree(d_arr);
}

// Times squareKernel on a cudaMalloc'd device buffer with explicit copies and
// prints the elapsed wall-clock time in milliseconds.
//
// A host array is filled with 1..N and copied to the device before the timer
// starts. The timed region covers the kernel launch, the device-to-host copy
// of the result, and a final cudaDeviceSynchronize().
//
// On any allocation or CUDA failure a diagnostic goes to stderr, no timing is
// printed, and everything allocated so far is released.
void usingGlobalMemory() {
    const size_t bytes = N * sizeof(int);

    int *h_arr = (int*)malloc(bytes);
    if (h_arr == nullptr) {
        fprintf(stderr, "Global Memory: host malloc failed\n");
        return;
    }

    int *d_arr = nullptr;
    cudaError_t err = cudaMalloc((void**)&d_arr, bytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "Global Memory: cudaMalloc failed: %s\n", cudaGetErrorString(err));
        free(h_arr);
        return;
    }

    for (int i = 0; i < N; i++) h_arr[i] = i + 1;
    err = cudaMemcpy(d_arr, h_arr, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Global Memory: host-to-device copy failed: %s\n",
                cudaGetErrorString(err));
        free(h_arr);
        cudaFree(d_arr);
        return;
    }

    auto start = std::chrono::high_resolution_clock::now();
    squareKernel<<<(N + 255) / 256, 256>>>(d_arr, N);
    // A bad launch configuration is only reported through cudaGetLastError().
    err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaMemcpy(h_arr, d_arr, bytes, cudaMemcpyDeviceToHost);
    }
    if (err == cudaSuccess) {
        // Kept so the measured region is the same as before; the copy above
        // has already returned by the time this runs.
        err = cudaDeviceSynchronize();
    }
    auto end = std::chrono::high_resolution_clock::now();

    if (err != cudaSuccess) {
        fprintf(stderr, "Global Memory: kernel or device-to-host copy failed: %s\n",
                cudaGetErrorString(err));
    } else {
        double duration = std::chrono::duration<double, std::milli>(end - start).count();
        printf("Global Memory Time: %.3f ms\n", duration);
    }

    free(h_arr);
    cudaFree(d_arr);
}

// Entry point: runs the managed-memory benchmark, then the explicit-copy
// benchmark, and exits with status 0.
int main() {
    // Table of benchmarks, executed in declaration order.
    void (*benchmarks[])() = {usingUnifiedMemory, usingGlobalMemory};

    for (auto runBenchmark : benchmarks) {
        runBenchmark();
    }
    return 0;
}

Writing cuda_malloc_managed.cu


In [4]:
!nvcc -arch=sm_75 cuda_malloc_managed.cu -o cuda_malloc_managed
!./cuda_malloc_managed

Unified Memory Time: 1.600 ms
Global Memory Time: 0.857 ms
