In [8]:
%%writefile warp_divergence.cu

#include <stdio.h>
#include <cuda.h>

#define N 1024  // Number of threads per block (Multiple warps)

__global__ void warpDivergentKernel(int *arr) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Warp Divergence due to if-else
    if (tid % 2 == 0) {
        arr[tid] = tid * tid; // Squaring for even threads
    } else {
        arr[tid] = tid * tid * tid; // Cubing for odd threads
    }
}

__global__ void optimizedKernel(int *arr) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int isOdd = tid % 2;  // 1 if odd, 0 if even

    // Optimized version avoiding warp divergence
    arr[tid] = tid * tid * (1 + isOdd * (tid - 1));
}

int main() {
    int *d_arr;
    int size = N * sizeof(int);

    // Allocate memory on the device
    cudaMalloc((void**)&d_arr, size);

    // CUDA event creation for timing
    cudaEvent_t start, stop;
    float timeDivergent, timeOptimized;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Set up kernel launch configuration
    int threadsPerBlock = 256;
    int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Measure time for Warp-Divergent Kernel
    cudaEventRecord(start);
    warpDivergentKernel<<<numBlocks, threadsPerBlock>>>(d_arr);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timeDivergent, start, stop);

    // Measure time for Optimized Kernel
    cudaEventRecord(start);
    optimizedKernel<<<numBlocks, threadsPerBlock>>>(d_arr);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timeOptimized, start, stop);

    // Print execution times
    printf("Execution Time (Warp Divergence) : %.6f ms\n", timeDivergent);
    printf("Execution Time (Optimized)      : %.6f ms\n", timeOptimized);
    printf("Speedup Achieved: %.2fx\n", timeDivergent / timeOptimized);

    // Cleanup
    cudaFree(d_arr);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}

Writing warp_divergence.cu


In [10]:
!nvcc -arch=sm_75 warp_divergence.cu -o warp_divergence
!./warp_divergence

Execution Time (Warp Divergence) : 0.143424 ms
Execution Time (Optimized)      : 0.019744 ms
Speedup Achieved: 7.26x
