In [4]:
%%writefile stack_cuda.cu

// Include CUDA runtime API (memory management, kernel launch, atomics)
#include <cuda_runtime.h>

// Include standard C++ input/output stream
#include <iostream>

// Maximum number of elements the stack can hold
#define STACK_CAPACITY 1024

// Number of CUDA threads used in kernels
#define THREADS 256

// ======================
// Stack structure definition
// ======================

// Structure representing a stack in GPU global memory
struct Stack {
    int data[STACK_CAPACITY]; // Fixed-size array storing stack elements
    int top;                  // Index of the next free position (stack pointer)
};

// ======================
// Device function: push
// ======================

// Pushes a value onto the stack
// Returns true if successful, false if stack is full
__device__ bool stack_push(Stack* s, int value) {

    // Atomically increment stack pointer and get old value
    int idx = atomicAdd(&s->top, 1);

    // Check for stack overflow
    if (idx >= STACK_CAPACITY) {
        // Roll back the increment if overflow occurred
        atomicSub(&s->top, 1);
        return false;
    }

    // Store value at the allocated stack position
    s->data[idx] = value;

    return true;
}

// ======================
// Device function: pop
// ======================

// Pops a value from the stack
// Stores popped value in *value
// Returns true if successful, false if stack is empty
__device__ bool stack_pop(Stack* s, int* value) {

    // Atomically decrement stack pointer and get previous index
    int idx = atomicSub(&s->top, 1) - 1;

    // Check for stack underflow
    if (idx < 0) {
        // Roll back decrement if stack was empty
        atomicAdd(&s->top, 1);
        return false;
    }

    // Retrieve value from stack
    *value = s->data[idx];

    return true;
}

// ======================
// Kernel: parallel push
// ======================

// CUDA kernel where multiple threads push values concurrently
__global__ void push_kernel(Stack* s) {

    // Compute global thread index
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Each thread attempts to push its thread ID
    stack_push(s, tid);
}

// ======================
// Kernel: parallel pop
// ======================

// CUDA kernel where multiple threads pop values concurrently
__global__ void pop_kernel(Stack* s, int* output) {

    // Compute global thread index
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    int value;

    // Attempt to pop from stack
    if (stack_pop(s, &value)) {
        // Store popped value if successful
        output[tid] = value;
    } else {
        // Mark failed pop (stack empty)
        output[tid] = -1;
    }
}

// ======================
// Main function (host code)
// ======================

int main() {

    // Pointer to stack allocated in GPU memory
    Stack* d_stack;

    // Pointer to output array in GPU memory
    int* d_output;

    // Allocate memory for stack on GPU
    cudaMalloc(&d_stack, sizeof(Stack));

    // Allocate memory for output array on GPU
    cudaMalloc(&d_output, THREADS * sizeof(int));

    // Host-side stack structure
    Stack h_stack;

    // Initialize stack pointer to zero (empty stack)
    h_stack.top = 0;

    // Copy initialized stack from host to device
    cudaMemcpy(d_stack, &h_stack, sizeof(Stack), cudaMemcpyHostToDevice);

    // Launch kernel to push values in parallel
    push_kernel<<<1, THREADS>>>(d_stack);

    // Wait until push kernel finishes
    cudaDeviceSynchronize();

    // Launch kernel to pop values in parallel
    pop_kernel<<<1, THREADS>>>(d_stack, d_output);

    // Wait until pop kernel finishes
    cudaDeviceSynchronize();

    // Host-side array to store popped values
    int h_output[THREADS];

    // Copy popped values from device to host
    cudaMemcpy(h_output, d_output, THREADS * sizeof(int), cudaMemcpyDeviceToHost);

    // Counter for successful pop operations
    int success = 0;

    // Check correctness of pops
    for (int i = 0; i < THREADS; i++) {
        if (h_output[i] != -1)
            success++;
    }

    // Print results
    std::cout << "Successful pops: " << success << std::endl;
    std::cout << "Expected (<= capacity): " << STACK_CAPACITY << std::endl;

    // Free GPU memory
    cudaFree(d_stack);
    cudaFree(d_output);

    return 0;
}

Overwriting stack_cuda.cu


In [5]:
!nvcc stack_cuda.cu -o stack

In [6]:
!./stack

Successful pops: 256
Expected (<= capacity): 1024


In [7]:
%%writefile queue_vs_stack.cu

// CUDA runtime API (memory management, kernel launches, atomics)
#include <cuda_runtime.h>

// Standard C++ input/output
#include <iostream>

// Fixed capacity for both stack and queue
#define CAPACITY 1024

// Number of CUDA threads
#define THREADS 256

// ======================
// STACK STRUCTURE
// ======================

// Stack stored in global memory
struct Stack {
    int data[CAPACITY];   // Stack storage
    int top;              // Stack pointer
};

// ======================
// QUEUE STRUCTURE
// ======================

// Queue stored in global memory (circular buffer)
struct Queue {
    int data[CAPACITY];   // Queue storage
    int head;             // Index for dequeue
    int tail;             // Index for enqueue
    int size;             // Current number of elements
};

// ======================
// STACK PUSH (device)
// ======================

// Push value onto stack
__device__ bool stack_push(Stack* s, int value) {

    // Atomically increment stack pointer
    int idx = atomicAdd(&s->top, 1);

    // Check overflow
    if (idx >= CAPACITY) {
        // Roll back if full
        atomicSub(&s->top, 1);
        return false;
    }

    // Store value
    s->data[idx] = value;
    return true;
}

// ======================
// STACK POP (device)
// ======================

// Pop value from stack
__device__ bool stack_pop(Stack* s, int* value) {

    // Atomically decrement stack pointer
    int idx = atomicSub(&s->top, 1) - 1;

    // Check underflow
    if (idx < 0) {
        // Roll back if empty
        atomicAdd(&s->top, 1);
        return false;
    }

    // Load value
    *value = s->data[idx];
    return true;
}

// ======================
// QUEUE ENQUEUE (device)
// ======================

// Add value to queue
__device__ bool queue_enqueue(Queue* q, int value) {

    // Atomically reserve a position
    int pos = atomicAdd(&q->tail, 1);

    // Check if queue is full
    if (atomicAdd(&q->size, 1) >= CAPACITY) {
        // Roll back changes
        atomicSub(&q->tail, 1);
        atomicSub(&q->size, 1);
        return false;
    }

    // Store value using circular indexing
    q->data[pos % CAPACITY] = value;
    return true;
}

// ======================
// QUEUE DEQUEUE (device)
// ======================

// Remove value from queue
__device__ bool queue_dequeue(Queue* q, int* value) {

    // Check if queue is empty
    if (atomicSub(&q->size, 1) <= 0) {
        // Roll back if empty
        atomicAdd(&q->size, 1);
        return false;
    }

    // Atomically reserve dequeue position
    int pos = atomicAdd(&q->head, 1);

    // Load value using circular indexing
    *value = q->data[pos % CAPACITY];
    return true;
}

// ======================
// STACK KERNELS
// ======================

// Parallel stack push
__global__ void stack_push_kernel(Stack* s) {

    // Global thread index
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Each thread pushes its ID
    stack_push(s, tid);
}

// Parallel stack pop
__global__ void stack_pop_kernel(Stack* s, int* out) {

    // Global thread index
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    int value;

    // Attempt pop
    if (stack_pop(s, &value))
        out[tid] = value;
    else
        out[tid] = -1;
}

// ======================
// QUEUE KERNELS
// ======================

// Parallel enqueue
__global__ void queue_enqueue_kernel(Queue* q) {

    // Global thread index
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Each thread enqueues its ID
    queue_enqueue(q, tid);
}

// Parallel dequeue
__global__ void queue_dequeue_kernel(Queue* q, int* out) {

    // Global thread index
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    int value;

    // Attempt dequeue
    if (queue_dequeue(q, &value))
        out[tid] = value;
    else
        out[tid] = -1;
}

// ======================
// MAIN FUNCTION
// ======================

int main() {

    // Device pointers
    Stack* d_stack;
    Queue* d_queue;
    int* d_output;

    // Allocate GPU memory
    cudaMalloc(&d_stack, sizeof(Stack));
    cudaMalloc(&d_queue, sizeof(Queue));
    cudaMalloc(&d_output, THREADS * sizeof(int));

    // Host stack initialization
    Stack h_stack;
    h_stack.top = 0;

    // Host queue initialization
    Queue h_queue;
    h_queue.head = 0;
    h_queue.tail = 0;
    h_queue.size = 0;

    // Copy to device
    cudaMemcpy(d_stack, &h_stack, sizeof(Stack), cudaMemcpyHostToDevice);
    cudaMemcpy(d_queue, &h_queue, sizeof(Queue), cudaMemcpyHostToDevice);

    // CUDA events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // ======================
    // STACK TIMING
    // ======================

    cudaEventRecord(start);

    stack_push_kernel<<<1, THREADS>>>(d_stack);
    stack_pop_kernel<<<1, THREADS>>>(d_stack, d_output);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float stack_time;
    cudaEventElapsedTime(&stack_time, start, stop);

    // ======================
    // QUEUE TIMING
    // ======================

    cudaEventRecord(start);

    queue_enqueue_kernel<<<1, THREADS>>>(d_queue);
    queue_dequeue_kernel<<<1, THREADS>>>(d_queue, d_output);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float queue_time;
    cudaEventElapsedTime(&queue_time, start, stop);

    // ======================
    // OUTPUT RESULTS
    // ======================

    std::cout << "Stack execution time: " << stack_time << " ms" << std::endl;
    std::cout << "Queue execution time: " << queue_time << " ms" << std::endl;

    // Free GPU memory
    cudaFree(d_stack);
    cudaFree(d_queue);
    cudaFree(d_output);

    return 0;
}

Writing queue_vs_stack.cu


In [8]:
!nvcc queue_vs_stack.cu -o queue_vs_stack

In [9]:
!./queue_vs_stack

Stack execution time: 7.5271 ms
Queue execution time: 0.002048 ms


# Answers to Control Questions

## 1. What is the difference between a stack and a queue?

A stack is a data structure that follows the **LIFO (Last In, First Out)** principle, where the last inserted element is removed first. A queue follows the **FIFO (First In, First Out)** principle, where the first inserted element is removed first.  
Stacks use a single pointer (top), while queues require at least two pointers (head and tail), making queues more complex to implement in parallel environments.

---

## 2. What problems arise during parallel access to data?

Parallel access can lead to **race conditions**, where multiple threads read and write shared data simultaneously, producing incorrect results. Other issues include **data corruption**, **lost updates**, **inconsistent states**, and **non-deterministic behavior**, making debugging and correctness verification difficult.

---

## 3. How do atomic operations help avoid conflicts in parallel data structures?

Atomic operations ensure that read-modify-write sequences are executed as a single, indivisible operation. This prevents multiple threads from modifying the same memory location at the same time, eliminating race conditions and ensuring correctness when updating shared variables such as stack pointers or queue indices.

---

## 4. What CUDA memory types are used to store data?

CUDA provides several memory types:
- **Global memory**: Large and accessible by all threads, but has high latency.
- **Shared memory**: Fast memory shared among threads within the same block.
- **Local memory (registers)**: Private to each thread and very fast but limited in size.
- **Constant memory**: Read-only memory optimized for broadcast to many threads.
- **Texture memory**: Cached memory optimized for spatial access patterns.

---

## 5. How does thread synchronization affect performance?

Synchronization ensures correct execution order but introduces overhead. Excessive synchronization can serialize parallel execution, reduce occupancy, and lower overall performance. Efficient GPU programs minimize synchronization while still guaranteeing correctness.

---

## 6. Why is shared memory important for optimizing parallel data structures?

Shared memory has much lower latency than global memory and allows fast data exchange between threads in the same block. Using shared memory reduces global memory access, improves memory coalescing, and significantly increases the performance of parallel data structures such as stacks, queues, and reduction algorithms.
