In [None]:
// Write the following CUDA source code to a file named stack_cuda.cu
%%writefile stack_cuda.cu

// Include the CUDA runtime API for memory management, kernel launches, and atomics
#include <cuda_runtime.h>

// Include standard C++ input/output stream library
#include <iostream>

// Define the maximum number of elements that the stack can store
#define STACK_CAPACITY 1024

// Define the number of CUDA threads per kernel launch
#define THREADS 256

// ======================
// Stack structure definition
// ======================

// Fixed-capacity stack of ints. The host allocates one instance in device
// memory and the device functions below update `top` with atomic operations.
struct Stack {
    int data[STACK_CAPACITY]; // Element storage; stack_push writes data[old top]
    int top;                  // Index of the next free slot (equals the element count when no operation is in flight)
};

// ======================
// Device function: push
// ======================

// Pushes `value` onto the stack `s`.
//
// A slot is reserved by atomically incrementing `s->top`; the value returned
// by atomicAdd (the old top) is the index this thread owns.
//
// Returns true if the value was stored, false if no valid slot could be
// reserved (the stack is full, or `top` was observed outside
// [0, STACK_CAPACITY) while another thread was rolling back a failed
// operation).
//
// NOTE: the slot is written after `top` has already been advanced, so a pop
// running concurrently with this push can read the slot before it is written.
// Run pushes and pops in separate phases (e.g. separate kernel launches).
__device__ bool stack_push(Stack* s, int value) {

    // Reserve a slot: atomicAdd returns the value of top before the increment
    int idx = atomicAdd(&s->top, 1);

    // Reject every index outside the array, not just the upper bound.
    // idx >= STACK_CAPACITY means the stack is full. idx < 0 can be seen while
    // a failed stack_pop has decremented top below zero and has not restored
    // it yet; writing data[idx] in that window would be out of bounds.
    if (idx < 0 || idx >= STACK_CAPACITY) {
        // Undo the reservation so top only reflects successful operations
        atomicSub(&s->top, 1);
        return false;
    }

    // The reserved slot belongs to this thread alone; store the value
    s->data[idx] = value;

    return true;
}

// ======================
// Device function: pop
// ======================

// Pops the most recently reserved element of `s` into `*value`.
//
// The element is claimed by atomically decrementing `s->top`; atomicSub
// returns the old top, so the claimed index is (old top - 1).
//
// Returns true and writes `*value` on success. Returns false and leaves
// `*value` untouched if no valid element could be claimed (the stack is
// empty, or `top` was observed outside (0, STACK_CAPACITY] while another
// thread was rolling back a failed operation).
//
// NOTE: intended for phases in which no push runs concurrently; a slot whose
// push has reserved but not yet written it would be read with stale contents.
__device__ bool stack_pop(Stack* s, int* value) {

    // Claim the top element: old top minus one is the index to read
    int idx = atomicSub(&s->top, 1) - 1;

    // Reject every index outside the array, not just the lower bound.
    // idx < 0 means the stack is empty. idx >= STACK_CAPACITY can be seen
    // while a failed stack_push has incremented top past the capacity and has
    // not rolled back yet; reading data[idx] then would be out of bounds.
    if (idx < 0 || idx >= STACK_CAPACITY) {
        // Undo the claim so top only reflects successful operations
        atomicAdd(&s->top, 1);
        return false;
    }

    // Hand the claimed element back to the caller
    *value = s->data[idx];

    return true;
}

// ======================
// Kernel: parallel push
// ======================

// Kernel: every launched thread pushes its own global thread index onto `s`.
// The result of stack_push is not inspected, so a thread that finds the stack
// full simply drops its value.
__global__ void push_kernel(Stack* s) {
    // Flat 1D index of this thread across the whole grid
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    stack_push(s, global_id);
}

// ======================
// Kernel: parallel pop
// ======================

// Kernel: every launched thread tries to pop one element from `s` and records
// the outcome in output[global thread index]: the popped value on success,
// -1 on failure. `output` must hold at least one int per launched thread.
__global__ void pop_kernel(Stack* s, int* output) {
    // Flat 1D index of this thread across the whole grid
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Only meaningful when stack_pop reports success
    int popped;
    const bool got_value = stack_pop(s, &popped);

    output[global_id] = got_value ? popped : -1;
}

// ======================
// Main function (host code)
// ======================

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what) {
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error (" << what << "): "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Entry point of the host program.
//
// Allocates a Stack and an output buffer on the device, has THREADS threads
// push their index, then has THREADS threads pop, and prints how many pops
// succeeded. Returns 0 on success and 1 if any CUDA call or kernel failed.
int main() {

    // Device pointers start out null so the cleanup at the end is safe even
    // when an allocation fails (cudaFree on a null pointer is a no-op)
    Stack* d_stack = nullptr;
    int* d_output = nullptr;

    // Host copy of the per-thread pop results
    int h_output[THREADS];

    // Allocate the stack and the output array on the GPU
    bool ok = cuda_ok(cudaMalloc(&d_stack, sizeof(Stack)), "cudaMalloc stack")
           && cuda_ok(cudaMalloc(&d_output, THREADS * sizeof(int)), "cudaMalloc output");

    // Zero the whole device structure: top becomes 0 (empty stack) without
    // copying an uninitialized host-side data array to the device
    if (ok)
        ok = cuda_ok(cudaMemset(d_stack, 0, sizeof(Stack)), "cudaMemset stack");

    // Push phase: one block of THREADS threads
    if (ok) {
        push_kernel<<<1, THREADS>>>(d_stack);
        // cudaGetLastError catches launch failures, the synchronize catches
        // faults raised while the kernel was running
        ok = cuda_ok(cudaGetLastError(), "push_kernel launch")
          && cuda_ok(cudaDeviceSynchronize(), "push_kernel execution");
    }

    // Pop phase: runs only after every push has completed
    if (ok) {
        pop_kernel<<<1, THREADS>>>(d_stack, d_output);
        ok = cuda_ok(cudaGetLastError(), "pop_kernel launch")
          && cuda_ok(cudaDeviceSynchronize(), "pop_kernel execution");
    }

    // Bring the per-thread results back to the host
    if (ok)
        ok = cuda_ok(cudaMemcpy(h_output, d_output, THREADS * sizeof(int),
                                cudaMemcpyDeviceToHost),
                     "cudaMemcpy output");

    if (ok) {
        // Count the threads whose pop succeeded (pop_kernel stores -1 on failure)
        int success = 0;
        for (int i = 0; i < THREADS; i++) {
            if (h_output[i] != -1)
                success++;
        }

        // Print the number of successful pop operations
        std::cout << "Successful pops: " << success << std::endl;

        // Print the maximum possible number of successful pops
        std::cout << "Expected (<= capacity): " << STACK_CAPACITY << std::endl;
    }

    // Release device memory on both the success and the failure path
    cudaFree(d_stack);
    cudaFree(d_output);

    return ok ? 0 : 1;
}


Overwriting stack_cuda.cu


In [None]:
!nvcc stack_cuda.cu -o stack

In [None]:
!./stack

Successful pops: 256
Expected (<= capacity): 1024


In [None]:
// Write the following CUDA source code to a file named queue_vs_stack.cu
%%writefile queue_vs_stack.cu

// Include CUDA runtime API for memory allocation, kernel launches, and atomic operations
#include <cuda_runtime.h>

// Include standard C++ input/output stream library
#include <iostream>

// Define a fixed maximum capacity for both stack and queue
#define CAPACITY 1024

// Define the number of CUDA threads per kernel launch
#define THREADS 256

// ======================
// STACK STRUCTURE
// ======================

// Fixed-capacity stack of ints. The host allocates one instance in device
// memory and the device functions below update `top` with atomic operations.
struct Stack {
    int data[CAPACITY];   // Element storage; stack_push writes data[old top]
    int top;              // Index of the next free slot (one past the last pushed element), not the top element itself
};

// ======================
// QUEUE STRUCTURE
// ======================

// Fixed-capacity FIFO queue of ints backed by a circular buffer. The host
// allocates one instance in device memory; all three counters are updated
// with atomic operations by the device functions below.
struct Queue {
    int data[CAPACITY];   // Circular buffer; indexed with (counter % CAPACITY)
    int head;             // Running count of dequeue reservations; reduced modulo CAPACITY to get the read slot
    int tail;             // Running count of enqueue reservations; reduced modulo CAPACITY to get the write slot
    int size;             // Current number of elements; used to detect the full and empty conditions
};

// ======================
// STACK PUSH (device)
// ======================

// Pushes `value` onto the stack `s`.
//
// A slot is reserved by atomically incrementing `s->top`; the value returned
// by atomicAdd (the old top) is the index this thread owns.
//
// Returns true if the value was stored, false if no valid slot could be
// reserved (the stack is full, or `top` was observed outside [0, CAPACITY)
// while another thread was rolling back a failed operation).
//
// NOTE: the slot is written after `top` has already been advanced, so a pop
// running concurrently with this push can read the slot before it is written.
// Run pushes and pops in separate phases (e.g. separate kernel launches).
__device__ bool stack_push(Stack* s, int value) {

    // Reserve a slot: atomicAdd returns the value of top before the increment
    int idx = atomicAdd(&s->top, 1);

    // Reject every index outside the array, not just the upper bound.
    // idx >= CAPACITY means the stack is full. idx < 0 can be seen while a
    // failed stack_pop has decremented top below zero and has not restored it
    // yet; writing data[idx] in that window would be out of bounds.
    if (idx < 0 || idx >= CAPACITY) {
        // Undo the reservation so top only reflects successful operations
        atomicSub(&s->top, 1);
        return false;
    }

    // The reserved slot belongs to this thread alone; store the value
    s->data[idx] = value;
    return true;
}

// ======================
// STACK POP (device)
// ======================

// Pops the most recently reserved element of `s` into `*value`.
//
// The element is claimed by atomically decrementing `s->top`; atomicSub
// returns the old top, so the claimed index is (old top - 1).
//
// Returns true and writes `*value` on success. Returns false and leaves
// `*value` untouched if no valid element could be claimed (the stack is
// empty, or `top` was observed outside (0, CAPACITY] while another thread was
// rolling back a failed operation).
//
// NOTE: intended for phases in which no push runs concurrently; a slot whose
// push has reserved but not yet written it would be read with stale contents.
__device__ bool stack_pop(Stack* s, int* value) {

    // Claim the top element: old top minus one is the index to read
    int idx = atomicSub(&s->top, 1) - 1;

    // Reject every index outside the array, not just the lower bound.
    // idx < 0 means the stack is empty. idx >= CAPACITY can be seen while a
    // failed stack_push has incremented top past the capacity and has not
    // rolled back yet; reading data[idx] then would be out of bounds.
    if (idx < 0 || idx >= CAPACITY) {
        // Undo the claim so top only reflects successful operations
        atomicAdd(&s->top, 1);
        return false;
    }

    // Hand the claimed element back to the caller
    *value = s->data[idx];
    return true;
}

// ======================
// QUEUE ENQUEUE (device)
// ======================

// Appends `value` to the queue `q`.
//
// Capacity is claimed first by incrementing `size`; only a thread that owns
// capacity goes on to reserve a position from `tail`. `tail` is therefore
// never rolled back, which guarantees that no position is handed out twice.
// (Reserving `tail` before the capacity check and decrementing it on failure
// lets a concurrent successful enqueue receive a position that is later
// issued again, so two values end up in the same slot.) This is the mirror
// image of queue_dequeue, which also checks `size` before touching `head`.
//
// Returns true if the value was stored, false if the queue was full.
//
// NOTE: `size` is advanced before the slot is written, so a dequeue running
// concurrently with this enqueue can read the slot before it holds `value`.
// Run enqueues and dequeues in separate phases (e.g. separate kernel launches).
__device__ bool queue_enqueue(Queue* q, int value) {

    // Claim one unit of capacity; atomicAdd returns the size before the increment
    if (atomicAdd(&q->size, 1) >= CAPACITY) {
        // Queue was already full: release the claim and report failure
        atomicSub(&q->size, 1);
        return false;
    }

    // Reserve a unique write position. The counter is reinterpreted as
    // unsigned so the modulo below cannot produce a negative index once the
    // int counter wraps around.
    unsigned int pos = static_cast<unsigned int>(atomicAdd(&q->tail, 1));

    // Store the value using circular buffer indexing
    q->data[pos % CAPACITY] = value;
    return true;
}

// ======================
// QUEUE DEQUEUE (device)
// ======================

// Removes the oldest reserved element of the queue `q` into `*value`.
//
// An element is claimed first by decrementing `size`; only a thread that owns
// an element goes on to reserve a position from `head`, so `head` is never
// rolled back.
//
// Returns true and writes `*value` on success. Returns false and leaves
// `*value` untouched if the queue was empty.
//
// NOTE: intended for phases in which no enqueue runs concurrently; a slot
// whose enqueue has claimed capacity but not yet written it would be read
// with stale contents.
__device__ bool queue_dequeue(Queue* q, int* value) {

    // Claim one element; atomicSub returns the size before the decrement
    if (atomicSub(&q->size, 1) <= 0) {
        // Queue was empty: give the claim back and report failure
        atomicAdd(&q->size, 1);
        return false;
    }

    // Reserve a unique read position. The counter is reinterpreted as
    // unsigned so the modulo below cannot produce a negative index once the
    // int counter wraps around.
    unsigned int pos = static_cast<unsigned int>(atomicAdd(&q->head, 1));

    // Load the value using circular buffer indexing
    *value = q->data[pos % CAPACITY];
    return true;
}

// ======================
// STACK KERNELS
// ======================

// Kernel: every launched thread pushes its own global thread index onto `s`.
// The result of stack_push is not inspected, so a thread that finds the stack
// full simply drops its value.
__global__ void stack_push_kernel(Stack* s) {
    // Flat 1D index of this thread across the whole grid
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    stack_push(s, global_id);
}

// Kernel: every launched thread tries to pop one element from `s` and records
// the outcome in out[global thread index]: the popped value on success, -1 on
// failure. `out` must hold at least one int per launched thread.
__global__ void stack_pop_kernel(Stack* s, int* out) {
    // Flat 1D index of this thread across the whole grid
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Only meaningful when stack_pop reports success
    int popped;
    const bool got_value = stack_pop(s, &popped);

    out[global_id] = got_value ? popped : -1;
}

// ======================
// QUEUE KERNELS
// ======================

// Kernel: every launched thread enqueues its own global thread index into `q`.
// The result of queue_enqueue is not inspected, so a thread that finds the
// queue full simply drops its value.
__global__ void queue_enqueue_kernel(Queue* q) {
    // Flat 1D index of this thread across the whole grid
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    queue_enqueue(q, global_id);
}

// Kernel: every launched thread tries to dequeue one element from `q` and
// records the outcome in out[global thread index]: the dequeued value on
// success, -1 on failure. `out` must hold at least one int per launched thread.
__global__ void queue_dequeue_kernel(Queue* q, int* out) {
    // Flat 1D index of this thread across the whole grid
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Only meaningful when queue_dequeue reports success
    int taken;
    const bool got_value = queue_dequeue(q, &taken);

    out[global_id] = got_value ? taken : -1;
}

// ======================
// MAIN FUNCTION
// ======================

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what) {
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error (" << what << "): "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Zeroes both device structures so that top, head, tail and size are all 0
// (empty stack, empty queue).
static bool reset_structures(Stack* d_stack, Queue* d_queue) {
    return cuda_ok(cudaMemset(d_stack, 0, sizeof(Stack)), "cudaMemset stack")
        && cuda_ok(cudaMemset(d_queue, 0, sizeof(Queue)), "cudaMemset queue");
}

// Program entry point.
//
// Times one push+pop pass on the stack and one enqueue+dequeue pass on the
// queue with CUDA events and prints both durations. All four kernels are run
// once untimed beforehand: the first launches in a process pay one-time
// initialization costs, and without the warm-up those costs are charged to
// whichever structure happens to be timed first, which makes the two numbers
// incomparable. Returns 0 on success and 1 if any CUDA call or kernel failed.
int main() {

    // Device pointers start out null so the cleanup at the end is safe even
    // when an allocation fails (cudaFree on a null pointer is a no-op)
    Stack* d_stack = nullptr;
    Queue* d_queue = nullptr;
    int* d_output = nullptr;

    // Timing events; the flags record which ones must be destroyed later
    cudaEvent_t start, stop;
    bool have_start = false;
    bool have_stop = false;

    // Measured durations in milliseconds
    float stack_time = 0.0f;
    float queue_time = 0.0f;

    // Allocate the stack, the queue and the shared output array on the GPU
    bool ok = cuda_ok(cudaMalloc(&d_stack, sizeof(Stack)), "cudaMalloc stack")
           && cuda_ok(cudaMalloc(&d_queue, sizeof(Queue)), "cudaMalloc queue")
           && cuda_ok(cudaMalloc(&d_output, THREADS * sizeof(int)), "cudaMalloc output");

    // Create the timing events
    if (ok)
        ok = have_start = cuda_ok(cudaEventCreate(&start), "cudaEventCreate start");
    if (ok)
        ok = have_stop = cuda_ok(cudaEventCreate(&stop), "cudaEventCreate stop");

    // Start from empty structures without copying uninitialized host structs
    if (ok)
        ok = reset_structures(d_stack, d_queue);

    // ======================
    // WARM-UP (untimed)
    // ======================

    if (ok) {
        stack_push_kernel<<<1, THREADS>>>(d_stack);
        stack_pop_kernel<<<1, THREADS>>>(d_stack, d_output);
        queue_enqueue_kernel<<<1, THREADS>>>(d_queue);
        queue_dequeue_kernel<<<1, THREADS>>>(d_queue, d_output);
        // cudaGetLastError catches launch failures, the synchronize catches
        // faults raised while the kernels were running
        ok = cuda_ok(cudaGetLastError(), "warm-up launch")
          && cuda_ok(cudaDeviceSynchronize(), "warm-up execution");
    }

    // The warm-up advanced the queue counters; restore the initial state so
    // the timed passes start from the same empty structures
    if (ok)
        ok = reset_structures(d_stack, d_queue);

    // ======================
    // STACK TIMING
    // ======================

    if (ok)
        ok = cuda_ok(cudaEventRecord(start), "cudaEventRecord start (stack)");
    if (ok) {
        // Kernels on the same stream run in launch order, so every push has
        // finished before the first pop starts
        stack_push_kernel<<<1, THREADS>>>(d_stack);
        stack_pop_kernel<<<1, THREADS>>>(d_stack, d_output);
        ok = cuda_ok(cudaGetLastError(), "stack kernel launch")
          && cuda_ok(cudaEventRecord(stop), "cudaEventRecord stop (stack)")
          && cuda_ok(cudaEventSynchronize(stop), "stack kernel execution")
          && cuda_ok(cudaEventElapsedTime(&stack_time, start, stop),
                     "cudaEventElapsedTime (stack)");
    }

    // ======================
    // QUEUE TIMING
    // ======================

    if (ok)
        ok = cuda_ok(cudaEventRecord(start), "cudaEventRecord start (queue)");
    if (ok) {
        queue_enqueue_kernel<<<1, THREADS>>>(d_queue);
        queue_dequeue_kernel<<<1, THREADS>>>(d_queue, d_output);
        ok = cuda_ok(cudaGetLastError(), "queue kernel launch")
          && cuda_ok(cudaEventRecord(stop), "cudaEventRecord stop (queue)")
          && cuda_ok(cudaEventSynchronize(stop), "queue kernel execution")
          && cuda_ok(cudaEventElapsedTime(&queue_time, start, stop),
                     "cudaEventElapsedTime (queue)");
    }

    // ======================
    // OUTPUT RESULTS
    // ======================

    if (ok) {
        // Print stack execution time
        std::cout << "Stack execution time: " << stack_time << " ms" << std::endl;
        // Print queue execution time
        std::cout << "Queue execution time: " << queue_time << " ms" << std::endl;
    }

    // Release events and device memory on both the success and failure path
    if (have_start)
        cudaEventDestroy(start);
    if (have_stop)
        cudaEventDestroy(stop);
    cudaFree(d_stack);
    cudaFree(d_queue);
    cudaFree(d_output);

    return ok ? 0 : 1;
}


Writing queue_vs_stack.cu


In [None]:
!nvcc queue_vs_stack.cu -o queue_vs_stack

In [None]:
!./queue_vs_stack

Stack execution time: 7.5271 ms
Queue execution time: 0.002048 ms


# Answers to Control Questions

## 1. What is the difference between a stack and a queue?

A stack is a data structure that follows the **LIFO (Last In, First Out)** principle, where the last inserted element is removed first. A queue follows the **FIFO (First In, First Out)** principle, where the first inserted element is removed first.  
Stacks use a single pointer (top), while queues require at least two pointers (head and tail), making queues more complex to implement in parallel environments.

---

## 2. What problems arise during parallel access to data?

Parallel access can lead to **race conditions**, where multiple threads read and write shared data simultaneously, producing incorrect results. Other issues include **data corruption**, **lost updates**, **inconsistent states**, and **non-deterministic behavior**, making debugging and correctness verification difficult.

---

## 3. How do atomic operations help avoid conflicts in parallel data structures?

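Atomic operations execute a read-modify-write sequence on a single memory location as one indivisible step. Concurrent updates to that location therefore cannot interleave or be lost, which makes it safe for many threads to update shared variables such as stack pointers or queue indices. An atomic protects only the word it operates on, however: the element written into the slot that the atomic reserved still needs separate ordering (for example, running producers and consumers in different kernel launches) before other threads may read it.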

---

## 4. What CUDA memory types are used to store data?

CUDA provides several memory types:
- **Global memory**: Large and accessible by all threads, but has high latency.
- **Shared memory**: Fast memory shared among threads within the same block.
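- **Registers**: Private to each thread and very fast, but limited in number.
- **Local memory**: Also private to each thread, but it resides in device memory and is therefore as slow as global memory; it holds register spills and per-thread arrays that cannot be kept in registers.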
- **Constant memory**: Read-only memory optimized for broadcast to many threads.
- **Texture memory**: Cached memory optimized for spatial access patterns.

---

## 5. How does thread synchronization affect performance?

Synchronization ensures correct execution order but introduces overhead. Excessive synchronization can serialize parallel execution, reduce occupancy, and lower overall performance. Efficient GPU programs minimize synchronization while still guaranteeing correctness.

---

## 6. Why is shared memory important for optimizing parallel data structures?

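Shared memory has much lower latency than global memory and allows fast data exchange between threads in the same block. A block can stage data in shared memory with a few coalesced global loads, operate on it there (including with block-level atomics, which contend far less than global atomics), and write the results back with coalesced stores. This reduces global memory traffic and significantly increases the performance of parallel data structures such as stacks and queues, as well as reduction algorithms.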
