In [None]:
%pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [None]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpft70cx48".


In [None]:
%%cuda
#include <climits>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <cuda_runtime.h>

#define N 10000000

// CUDA kernel: folds the maximum of arr[0..size) into *max.
//
// Launch layout : 1-D grid of 1-D blocks. Any grid size and any block size
//                 (power of two or not) produces a correct result.
// Shared memory : blockDim.x * sizeof(int) bytes of dynamic shared memory
//                 (third launch parameter).
// Precondition  : *max must be initialised before the launch to a value that
//                 is not greater than the true maximum (INT_MIN is the safe
//                 choice), because block results are merged with atomicMax.
//
// arr  - device pointer to the input values (only read by this kernel)
// max  - device pointer to the single int that receives the result
// size - number of elements in arr; values <= 0 leave *max effectively
//        untouched (atomicMax with INT_MIN)
__global__ void findMaxGPU(int *arr, int *max, int size) {
    extern __shared__ int sdata[];
    const unsigned int index = threadIdx.x;                 // slot of this thread in shared memory

    // 64-bit indexing so blockIdx.x * blockDim.x cannot wrap for large inputs;
    // a negative size is treated as an empty array.
    const size_t count = size > 0 ? (size_t)size : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    // Each thread folds every element it owns into a private maximum. When the
    // grid covers the whole array this loop runs at most once per thread.
    // Threads that own no element keep INT_MIN, the identity for max.
    int localMax = INT_MIN;
    for (size_t tid = (size_t)blockIdx.x * blockDim.x + index; tid < count; tid += stride) {
        const int value = arr[tid];
        if (value > localMax) {
            localMax = value;
        }
    }
    sdata[index] = localMax;
    __syncthreads();                                        // all slots written before any is read

    // Tree reduction in shared memory. The number of live slots is halved with
    // rounding up, so the middle slot of an odd-sized range is carried over
    // instead of being dropped; for power-of-two block sizes this is the usual
    // halving scheme. Writers touch slots below `half`, readers read slots at
    // or above `half`, so there is no intra-step race.
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (index + half < active && sdata[index] < sdata[index + half]) {
            sdata[index] = sdata[index + half];
        }
        __syncthreads();                                    // reached by every thread: loop bounds are block-uniform
        active = half;
    }

    // One atomic per block merges the block maximum into the global result.
    if (index == 0) {
        atomicMax(max, sdata[0]);
    }
}

// Host reference implementation: returns the largest value in arr[0..size).
//
// arr  - pointer to the input values (only read)
// size - number of elements in arr
//
// Returns INT_MIN (the identity for max) when arr is null or size <= 0,
// instead of reading arr[0] out of bounds.
int findMaxCPU(int *arr, int size) {
    if (arr == nullptr || size <= 0) {
        return INT_MIN;
    }

    int max_val = arr[0];
    for (int i = 1; i < size; i++) {
        if (arr[i] > max_val) {
            max_val = arr[i];
        }
    }
    return max_val;
}

// Evaluates a CUDA runtime call and terminates with a diagnostic if it does
// not return cudaSuccess. Unchecked failures are sticky and would otherwise
// surface as confusing errors (or wrong results) in later calls.
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        cudaError_t cudaCheckErr_ = (call);                                    \
        if (cudaCheckErr_ != cudaSuccess) {                                    \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__       \
                      << ": " << cudaGetErrorString(cudaCheckErr_)             \
                      << std::endl;                                            \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)

// Fills an array of N pseudo-random ints, finds its maximum on the GPU and on
// the CPU, and prints both results together with the measured times.
int main() {
    srand(42);                                                                  // fixed seed: reproducible input

    int *h_arr = (int *)malloc(N * sizeof(int));                                // host input buffer
    if (h_arr == NULL) {
        std::cerr << "Host allocation of " << N * sizeof(int) << " bytes failed" << std::endl;
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++) {
        h_arr[i] = rand() % 1000;                                               // Random values between 0 and 999
    }

    int *d_arr, *d_max;
    CUDA_CHECK(cudaMalloc((void **)&d_arr, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void **)&d_max, sizeof(int)));

    CUDA_CHECK(cudaMemcpy(d_arr, h_arr, N * sizeof(int), cudaMemcpyHostToDevice));

    // cudaMemset fills BYTES with the low 8 bits of its value argument, so
    // passing INT_MIN (0x80000000) would store 0 rather than INT_MIN. Copy the
    // real 4-byte value instead so the atomicMax accumulator starts at the
    // identity and negative inputs are handled correctly.
    const int h_max_init = INT_MIN;
    CUDA_CHECK(cudaMemcpy(d_max, &h_max_init, sizeof(int), cudaMemcpyHostToDevice));

    int blockSize = 256;
    int gridSize = (N + blockSize - 1) / blockSize;                             // ceil-div: one thread per element

    // Start timing for GPU
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    findMaxGPU<<<gridSize, blockSize, blockSize * sizeof(int)>>>(d_arr, d_max, N);
    CUDA_CHECK(cudaGetLastError());                                             // launch-configuration errors
    CUDA_CHECK(cudaEventRecord(stop));

    // Calculate GPU computation time; waiting on `stop` also surfaces any
    // fault raised while the kernel was executing.
    CUDA_CHECK(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    int h_max_gpu;
    CUDA_CHECK(cudaMemcpy(&h_max_gpu, d_max, sizeof(int), cudaMemcpyDeviceToHost));

    // CPU computation time
    clock_t cpu_start = clock();
    int h_max_cpu = findMaxCPU(h_arr, N);
    clock_t cpu_end = clock();
    float cpu_time = 1000.0 * (cpu_end - cpu_start) / CLOCKS_PER_SEC;

    // Display results
    std::cout << "Hasil di CPU: " << h_max_cpu << std::endl;
    std::cout << "Hasil di GPU: " << h_max_gpu << std::endl;
    std::cout << "Waktu komputasi:" << std::endl;
    std::cout << "GPU Time: " << milliseconds << " ms" << std::endl;
    std::cout << "CPU Time: " << cpu_time << " ms" << std::endl;
    std::cout << "Speedup: " << cpu_time / milliseconds << "x" << std::endl;

    // Clean up
    CUDA_CHECK(cudaFree(d_arr));
    CUDA_CHECK(cudaFree(d_max));
    free(h_arr);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

Hasil di CPU: 999
Hasil di GPU: 999
Waktu komputasi:
GPU Time: 0.926176 ms
CPU Time: 25.519 ms
Speedup: 27.5531x

