In [6]:
%%writefile cuda_streams.cu

#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>

#define N 1000000 // Array size

/**
 * Squares every element of `arr` in place.
 *
 * Launch layout: one-dimensional grid of one-dimensional blocks (only the .x
 * components are read). A grid-stride loop is used, so any grid size >= 1
 * block covers all `size` elements; the launch does not have to supply one
 * thread per element. No shared memory is used.
 *
 * @param arr   Device pointer to at least `size` ints; overwritten with the
 *              square of each element.
 * @param size  Number of elements to process; nothing is touched when <= 0.
 *
 * Note: the product is computed in `int`, so inputs whose square does not fit
 * in an int overflow.
 */
__global__ void squareKernel(int *arr, int size) {
    // Do the index arithmetic in 64 bits: blockIdx.x * blockDim.x is evaluated
    // in 32-bit unsigned and, once narrowed to int, could wrap to a negative
    // value that slips past the bounds check on very large grids.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < size; idx += stride) {
        const int value = arr[idx];
        arr[idx] = value * value;
    }
}

// Evaluates a CUDA runtime call and, if it did not return cudaSuccess, prints
// the failing location plus the runtime's error string and exits the process.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cuda_err_ = (call);                                     \
        if (cuda_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cuda_err_));                         \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/**
 * Fills an N-element array with 1..N, squares it on the GPU by splitting the
 * array into two chunks processed on two separate streams, copies the result
 * back and prints the first (up to) 10 values.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 *
 * Note: elements are `int`, so only inputs up to 46340 have a square that
 * fits in a 32-bit int; larger elements overflow inside the kernel. The
 * printed prefix is unaffected.
 */
int main() {
    const size_t bytes = (size_t)N * sizeof(int);
    const int threads_per_block = 256;

    // Host buffer initialised with 1..N.
    int *h_arr = (int*)malloc(bytes);
    if (h_arr == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++) h_arr[i] = i + 1;

    int *d_arr = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_arr, bytes));

    // Blocking copy: the data is on the device before either kernel launches.
    CUDA_CHECK(cudaMemcpy(d_arr, h_arr, bytes, cudaMemcpyHostToDevice));

    cudaStream_t stream1, stream2;
    CUDA_CHECK(cudaStreamCreate(&stream1));
    CUDA_CHECK(cudaStreamCreate(&stream2));

    // Split so that the two chunks always add up to N, even when N is odd.
    const int first_count = N / 2;
    const int second_count = N - first_count;

    // A zero-block grid is an invalid launch configuration, so skip empty chunks.
    if (first_count > 0) {
        const int blocks = (first_count + threads_per_block - 1) / threads_per_block;
        squareKernel<<<blocks, threads_per_block, 0, stream1>>>(d_arr, first_count);
        CUDA_CHECK(cudaGetLastError());  // launch errors are not returned by <<<>>>
    }
    if (second_count > 0) {
        const int blocks = (second_count + threads_per_block - 1) / threads_per_block;
        squareKernel<<<blocks, threads_per_block, 0, stream2>>>(d_arr + first_count, second_count);
        CUDA_CHECK(cudaGetLastError());
    }

    // Errors raised while the kernels execute surface at these sync points.
    CUDA_CHECK(cudaStreamSynchronize(stream1));
    CUDA_CHECK(cudaStreamSynchronize(stream2));

    CUDA_CHECK(cudaMemcpy(h_arr, d_arr, bytes, cudaMemcpyDeviceToHost));

    // Never read past the end of the buffer if N is smaller than 10.
    const int shown = (N < 10) ? N : 10;
    printf("First 10 squared values: ");
    for (int i = 0; i < shown; i++) {
        printf("%d ", h_arr[i]);
    }
    printf("\n");

    CUDA_CHECK(cudaStreamDestroy(stream1));
    CUDA_CHECK(cudaStreamDestroy(stream2));
    CUDA_CHECK(cudaFree(d_arr));
    free(h_arr);

    return 0;
}

Overwriting cuda_streams.cu


In [7]:
!nvcc -arch=sm_75 cuda_streams.cu -o cuda_streams
!./cuda_streams

First 10 squared values: 1 4 9 16 25 36 49 64 81 100 
