In [1]:
#ASSIGNMENT 4 Q1
%%writefile sum.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 1024


__global__ void sumKernel(int *input, int *output) {
    int tid = threadIdx.x;
    if (tid == 0) {
        // Task A: Iterative Sum
        int sum = 0;
        for (int i = 0; i < N; i++) {
            sum += input[i];
        }
        output[0] = sum;
    }
    else if (tid == 1) {

        output[1] = (N * (N - 1)) / 2;
    }
}

int main() {
    int h_input[N], h_output[2] = {0};

    for (int i = 0; i < N; i++) {
        h_input[i] = i;
    }

    int *d_input, *d_output;
    cudaMalloc(&d_input, N * sizeof(int));
    cudaMalloc(&d_output, 2 * sizeof(int));

    cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice);


    cudaEvent_t start, stop;
    float milliseconds = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);

    sumKernel<<<1, 2>>>(d_input, d_output);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);

    cudaMemcpy(h_output, d_output, 2 * sizeof(int), cudaMemcpyDeviceToHost);


    printf("Sum using Iteration (Thread 0): %d\n", h_output[0]);
    printf("Sum using Formula   (Thread 1): %d\n", h_output[1]);
    printf("GPU Kernel Execution Time: %.6f ms\n", milliseconds);


    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
}

Writing sum.cu


In [2]:
!nvcc --gpu-architecture=sm_70 sum.cu -o sum
!./sum

Sum using Iteration (Thread 0): 523776
Sum using Formula   (Thread 1): 523776
GPU Kernel Execution Time: 0.225952 ms


In [3]:
#ASSIGNMENT 4 Q2
%%writefile merging.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

__device__ void merge(int *arr, int l, int m, int r, int *temp) {
    int i = l, j = m + 1, k = 0;

    while (i <= m && j <= r) {
        if (arr[i] <= arr[j]) {
            temp[k++] = arr[i++];
        } else {
            temp[k++] = arr[j++];
        }
    }

    while (i <= m) temp[k++] = arr[i++];
    while (j <= r) temp[k++] = arr[j++];

    for (i = l, k = 0; i <= r; i++, k++) {
        arr[i] = temp[k];
    }
}

__global__ void mergeSortKernel(int *arr, int n, int step) {
    extern __shared__ int temp[];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int l = idx * step * 2;
    int m = l + step - 1;
    int r = min(l + step * 2 - 1, n - 1);

    if (l < n && m < n && r < n) {
        merge(arr, l, m, r, temp);
    }
}

void cudaMergeSort(int *h_arr, int n) {
    int *d_arr;
    size_t size = n * sizeof(int);

    // Allocate device memory
    cudaMalloc(&d_arr, size);
    cudaMemcpy(d_arr, h_arr, size, cudaMemcpyHostToDevice);

    // Kernel configuration
    dim3 threadsPerBlock(BLOCK_SIZE);
    dim3 blocksPerGrid((n + threadsPerBlock.x - 1) / threadsPerBlock.x);

    // Perform iterative merge sort
    for (int step = 1; step < n; step *= 2) {
        mergeSortKernel<<<blocksPerGrid, threadsPerBlock, BLOCK_SIZE * sizeof(int)>>>(d_arr, n, step);
        cudaDeviceSynchronize();
    }

    // Copy sorted array back to host
    cudaMemcpy(h_arr, d_arr, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_arr);
}

int main() {
    int n = 1000;
    int *arr = (int*)malloc(n * sizeof(int));

    // Initialize array with random values
    srand(time(NULL));
    for (int i = 0; i < n; i++) {
        arr[i] = rand() % 1000;
    }

    printf("Unsorted Array:\n");
    for (int i = 0; i < 10; i++) { // Print first 10 elements
        printf("%d ", arr[i]);
    }
    printf("\n");

    // Time CUDA Merge Sort
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    cudaMergeSort(arr, n);
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);

    float milliseconds;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Sorted Array:\n");
    for (int i = 0; i < 10; i++) { // Print first 10 elements
        printf("%d ", arr[i]);
    }

    printf("\nCUDA Merge Sort Time: %.4f ms\n", milliseconds);

    free(arr);

    return 0;
}



Writing merging.cu


In [4]:
!nvcc -o merging merging.cu -lcurand -arch=sm_75
!./merging

Unsorted Array:
29 593 78 245 832 10 435 271 788 886 
Sorted Array:
29 593 78 245 832 10 435 271 788 886 
CUDA Merge Sort Time: 0.0000 ms


In [5]:
#ASSIGNMENT 5 Q1
%%writefile vec_add.cu
#include <stdio.h>
#include <cuda.h>

#define N 1024  // Size of vectors

// Declare vectors in unified memory (accessible from host and device)
__device__ __managed__ float A[N], B[N], C[N];

// CUDA kernel for vector addition
__global__ void vectorAdd() {
    int i = threadIdx.x + blockIdx.x * blockDim.x;  // Global thread ID
    if (i < N)
        C[i] = A[i] + B[i];  // Perform element-wise addition
}

// CUDA error checker utility
void check(cudaError_t err, const char *msg) {
    if (err != cudaSuccess) {
        printf("CUDA error %s: %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

int main() {
    // Initialize vectors A and B on the host
    for (int i = 0; i < N; ++i) {
        A[i] = i * 1.0f;
        B[i] = i * 2.0f;
    }

    // Create CUDA events for timing
    cudaEvent_t start, stop;
    float time_ms = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);  // Start timing

    // Launch vector addition kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>();  // Launch GPU kernel
    check(cudaGetLastError(), "Kernel launch");

    // Stop timing
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_ms, start, stop);  // Calculate elapsed time in ms

    // Display first 5 elements of the result vector
    printf("C = [");
    for (int i = 0; i < 5; ++i) printf("%.1f ", C[i]);
    printf("...]\n");

    // Query device properties for theoretical bandwidth calculation
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    float memClock = prop.memoryClockRate * 1e3;   // Convert to Hz
    float busWidth = prop.memoryBusWidth;          // In bits
    float theoreticalBW = 2 * memClock * busWidth / 8 / 1e9; // GB/s (DDR hence x2)
    printf("Theoretical Bandwidth: %.2f GB/s\n", theoreticalBW);

    // Calculate actual bandwidth used by the kernel
    float totalBytes = 2 * N * sizeof(float) + N * sizeof(float); // A and B read, C written
    float measuredBW = totalBytes / (time_ms / 1000.0f) / 1e9;     // GB/s
    printf("Measured Bandwidth: %.2f GB/s\n", measuredBW);
    printf("Execution Time: %.4f ms\n", time_ms);

    return 0;
}


Writing vec_add.cu


In [7]:
!nvcc --gpu-architecture=sm_70 vec_add.cu -o vec_add
!./vec_add

C = [0.0 3.0 6.0 9.0 12.0 ...]
Theoretical Bandwidth: 320.06 GB/s
Measured Bandwidth: 0.03 GB/s
Execution Time: 0.3795 ms


In [None]:
!nvidia-smi


Wed Apr 16 11:53:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   50C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!apt-get update
!apt-get install -y cuda-toolkit-10-1
!export PATH=/usr/local/cuda-10.1/bin${PATH:+:${PATH}}
!export LD_LIBRARY_PATH=/usr/local/cuda-10.1/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

!nvcc --version

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [73.0 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,243 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,837 kB]
Get:13 http://archive.ubuntu.com/ub

In [None]:
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
!sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
!wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
!sudo dpkg -i cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
!sudo cp /var/cuda-repo-ubuntu2004-11-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
!sudo apt-get update
!sudo apt-get -y install cuda-toolkit-11-8

--2025-04-16 12:41:56--  https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.46.63.202, 23.46.63.200
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.46.63.202|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 190 [application/octet-stream]
Saving to: ‘cuda-ubuntu2004.pin’


2025-04-16 12:41:57 (55.9 MB/s) - ‘cuda-ubuntu2004.pin’ saved [190/190]

--2025-04-16 12:41:57--  https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 23.46.63.202, 23.46.63.200
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|23.46.63.202|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3181876424 (3.0G) [application/x-deb]
Saving to: ‘

In [None]:
!echo 'export PATH=/usr/local/cuda-11.8/bin${PATH:+:${PATH}}' >> ~/.bashrc
!echo 'export LD_LIBRARY_PATH=/usr/local/cuda-11.8/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc
%reset -f

In [None]:
!nvcc --version
# Should show: release 11.8, V11.8.89
!nvidia-smi
# Should show CUDA Version: 12.0+ (driver compatibility)

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Wed Apr 16 12:49:28 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                       