<a href="https://colab.research.google.com/github/awaisarif18/PDC-Assignments/blob/main/PDC_cuda_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CUDA Assignment 03
Mohammad Mehdi
SP22-BCS-020


In [None]:
# Cell 1 — Verify GPU and nvcc
import os
print("Note: Make sure Runtime -> Change runtime type -> GPU is selected.")
print("Python version:", os.sys.version)

# Run nvidia-smi and check nvcc
!nvidia-smi || echo "nvidia-smi not available"

# Check nvcc
!which nvcc && nvcc --version || echo "nvcc not found; make sure Colab GPU runtime selected"


Note: Make sure Runtime -> Change runtime type -> GPU is selected.
Python version: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
Thu Oct  2 07:41:15 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----

In [None]:
%%bash
# Cell 2 — Write CUDA file (cuda_demo.cu)
# The heredoc delimiter is quoted ('EOF'), so the shell performs no expansion on
# the text that follows: characters such as $, ` and \ reach cuda_demo.cu verbatim.
cat > cuda_demo.cu << 'EOF'
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define N 1024

// CUDA_CHECK(call): evaluate a CUDA runtime expression exactly once. If the
// result is not cudaSuccess, print the file/line of the call site together with
// cudaGetErrorString() to stderr and terminate with EXIT_FAILURE.
// The argument is parenthesised and the temporary has a macro-specific name so
// that an expression mentioning a caller variable called `err` is not captured
// by the macro's own local. The do/while(0) wrapper makes the macro behave as a
// single statement (safe inside an unbraced if/else).
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        const cudaError_t cuda_check_err_ = (call);                           \
        if (cuda_check_err_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA Error at %s:%d -> %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cuda_check_err_));           \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// kernel1: element-wise sum, C[i] = A[i] + B[i] for every i in [0, n).
// Each thread handles at most one element, selected by its global x index;
// threads whose index falls at or beyond n do nothing.
__global__ void kernel1(const int *A, const int *B, int *C, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    C[gid] = A[gid] + B[gid];
}

// kernel2: element-wise square, D[i] = C[i] * C[i] for every i in [0, n).
// Each thread handles at most one element, selected by its global x index;
// threads whose index falls at or beyond n do nothing.
// The product is computed in int, so callers must keep |C[i]| small enough for
// the square to fit in an int.
__global__ void kernel2(const int *C, int *D, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    const int value = C[gid];
    D[gid] = value * value;
}

// reduction (per-block shared-memory sum)
//
// Each block sums the (up to) blockDim.x elements of `in` that its threads map
// to and writes the partial sum to block_sums[blockIdx.x]; the caller adds the
// per-block partial sums together afterwards.
//
//   in          input values, n elements
//   block_sums  output, one long long per block (gridDim.x entries)
//   n           number of valid elements in `in`; n <= 0 yields all-zero sums
//
// Launch requirement: the dynamic shared-memory size must be at least
// blockDim.x * sizeof(long long) bytes, because every thread stores one
// long long in sdata[].
//
// The tree reduction works for any block size, not only powers of two: at each
// step the active range [0, active) is folded onto its lower ceil(active/2)
// entries, so an odd leftover element is carried forward instead of dropped.
__global__ void block_reduce_sum(const int *in, long long *block_sums, int n) {
    extern __shared__ long long sdata[];
    const unsigned int tid = threadIdx.x;
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the input contribute 0. The explicit n > 0 test
    // keeps a negative n from being converted to a huge unsigned bound.
    long long x = 0;
    if (n > 0 && idx < static_cast<unsigned int>(n)) x = in[idx];
    sdata[tid] = x;
    __syncthreads();

    // The loop bounds depend only on blockDim.x, so every thread of the block
    // reaches each __syncthreads() the same number of times.
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        // Reads come from [half, active), writes go to [0, active - half):
        // the two ranges never overlap within a step.
        if (tid + half < active) sdata[tid] += sdata[tid + half];
        __syncthreads();
        active = half;
    }

    if (tid == 0) block_sums[blockIdx.x] = sdata[0];
}

// Host driver for the demo.
//
// Steps:
//   1. Fill A[i] = i and B[i] = 2*i on the host and copy both to the device.
//   2. Launch kernel1 (C = A + B) and kernel2 (D = C * C), then copy C and D back.
//   3. Print a few samples and validate every element against the host inputs.
//   4. Sum D on the GPU (per-block partial sums, added up on the host) and on
//      the CPU, and print both totals.
//
// Returns 0 only when the element-wise validation passes and the GPU and CPU
// sums agree; returns 1 otherwise. Any CUDA runtime error terminates the
// process through CUDA_CHECK.
int main() {
    // The sample printout below indexes elements 0..2 and N-3..N-1.
    static_assert(N >= 3, "N must be at least 3 for the sample printout");

    printf("Starting CUDA demo with N=%d\n", N);

    const size_t bytes = N * sizeof(int);

    // Host allocation
    int *h_A = (int*)malloc(bytes);
    int *h_B = (int*)malloc(bytes);
    int *h_C = (int*)malloc(bytes);
    int *h_D = (int*)malloc(bytes);
    if (!h_A || !h_B || !h_C || !h_D) {
        fprintf(stderr, "Host malloc failed\n");
        free(h_A); free(h_B); free(h_C); free(h_D);  // free(NULL) is a no-op
        return 1;
    }

    for (int i = 0; i < N; ++i) { h_A[i] = i; h_B[i] = 2*i; }

    // Device allocation
    int *d_A = nullptr, *d_B = nullptr, *d_C = nullptr, *d_D = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&d_A, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_B, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_C, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_D, bytes));

    // Copy host -> device
    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));

    // Enough blocks to cover all N elements (ceiling division).
    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    // Launch kernels. Neither launch names a stream, so both go to the same
    // (default) stream; kernel2 consumes the d_C written by kernel1.
    printf("Launching kernel1 (serial) ...\n");
    kernel1<<<blocks, threads>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());

    printf("Launching kernel2 (serial) ...\n");
    kernel2<<<blocks, threads>>>(d_C, d_D, N);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_D, d_D, bytes, cudaMemcpyDeviceToHost));

    // Plain arrays are used for the sample indices so that no
    // <initializer_list> include is needed for the range-for loops.
    const int head[] = {0, 1, 2};
    const int tail[] = {N-3, N-2, N-1};
    printf("Samples (first 3, last 3):\n");
    for (int i : head) printf("i=%d A=%d B=%d C=%d D=%d\n", i, h_A[i], h_B[i], h_C[i], h_D[i]);
    printf("...\n");
    for (int i : tail) printf("i=%d A=%d B=%d C=%d D=%d\n", i, h_A[i], h_B[i], h_C[i], h_D[i]);

    // Element-wise validation against values recomputed on the host.
    bool ok = true;
    for (int i = 0; i < N; ++i) {
        if (h_C[i] != h_A[i] + h_B[i]) { ok = false; break; }
        if (h_D[i] != h_C[i] * h_C[i]) { ok = false; break; }
    }
    printf("Validation %s\n", ok ? "PASSED" : "FAILED");

    // Reduction demo
    int threads_red = 256;
    int blocks_red = (N + threads_red - 1) / threads_red;
    long long *h_block_sums = (long long*)malloc(blocks_red * sizeof(long long));
    if (!h_block_sums) {
        fprintf(stderr, "Host malloc failed\n");
        free(h_A); free(h_B); free(h_C); free(h_D);
        return 1;
    }
    long long *d_block_sums = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&d_block_sums, blocks_red * sizeof(long long)));

    // Third launch argument: one long long of dynamic shared memory per thread.
    block_reduce_sum<<<blocks_red, threads_red, threads_red * sizeof(long long)>>>(d_D, d_block_sums, N);
    CUDA_CHECK(cudaGetLastError());  // same launch check as kernel1/kernel2
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy(h_block_sums, d_block_sums, blocks_red * sizeof(long long), cudaMemcpyDeviceToHost));

    // Final accumulation of the per-block partial sums happens on the host.
    long long gpu_sum = 0;
    for (int i = 0; i < blocks_red; ++i) gpu_sum += h_block_sums[i];

    long long cpu_sum = 0;
    for (int i = 0; i < N; ++i) cpu_sum += (long long)h_D[i];

    printf("Reduction sum GPU = %lld\n", gpu_sum);
    printf("Reduction sum CPU = %lld\n", cpu_sum);

    const bool sums_match = (gpu_sum == cpu_sum);
    if (!sums_match) fprintf(stderr, "Reduction mismatch: GPU and CPU sums differ\n");

    free(h_A); free(h_B); free(h_C); free(h_D); free(h_block_sums);
    CUDA_CHECK(cudaFree(d_A)); CUDA_CHECK(cudaFree(d_B)); CUDA_CHECK(cudaFree(d_C)); CUDA_CHECK(cudaFree(d_D));
    CUDA_CHECK(cudaFree(d_block_sums));
    printf("Done.\n");

    // Report failure to the calling shell instead of always returning success.
    return (ok && sums_match) ? 0 : 1;
}
EOF

# Build the demo binary; stop the cell with a non-zero status if nvcc fails.
if ! nvcc -o cuda_demo cuda_demo.cu; then
    echo "nvcc compile failed"
    exit 1
fi

In [None]:
# Detect GPU model and driver info
import subprocess, shlex, sys

def run(cmd):
    """Run ``cmd`` without a shell and return its combined stdout/stderr as text.

    Args:
        cmd: Command line as a single string; it is tokenised with
            ``shlex.split`` and executed directly (no shell features).

    Returns:
        The decoded output of the command. If the command exits with a
        non-zero status, whatever it printed is still returned. If the
        program cannot be started at all (for example, it is not installed),
        a one-line message describing the problem is returned instead of
        raising, so a missing tool does not abort the cell.
    """
    try:
        raw = subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # Non-zero exit status: keep the captured output for display.
        raw = e.output
    except OSError as e:
        # Covers FileNotFoundError / PermissionError raised when launching.
        return f"{cmd}: could not be run ({e})\n"
    # Tool output is not guaranteed to be valid UTF-8; never fail on decoding.
    return raw.decode(errors="replace")

# Each entry pairs a heading with the command whose output is shown beneath it.
reports = [
    ("nvidia-smi output:", "nvidia-smi -L"),
    ("Detailed nvidia-smi:", "nvidia-smi"),
    ("nvcc version (if available):", "nvcc --version"),
]

for position, (heading, command) in enumerate(reports):
    if position > 0:
        print()  # blank separator line between consecutive reports
    print(heading)
    print(run(command))


nvidia-smi output:
GPU 0: Tesla T4 (UUID: GPU-d95790f3-1089-fd7a-8c5d-a114687791ff)


Detailed nvidia-smi:
Thu Oct  2 07:41:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+--------------------------------

In [None]:
%%bash
echo "Recompiling with multiple gencode targets..."
# Source file and output binary; change these if the CUDA file is named differently.
SRC="cuda_demo.cu"
BIN="cuda_demo"

# One -gencode entry per target. The first four use code=sm_XX; the last one
# uses code=compute_90 rather than an sm_ target.
GENCODE_FLAGS=(
  -gencode=arch=compute_70,code=sm_70
  -gencode=arch=compute_75,code=sm_75
  -gencode=arch=compute_80,code=sm_80
  -gencode=arch=compute_86,code=sm_86
  -gencode=arch=compute_90,code=compute_90
)

# Abort the cell with a non-zero status if the build fails.
if ! nvcc -O2 "$SRC" -o "$BIN" "${GENCODE_FLAGS[@]}"; then
  echo "nvcc compile FAILED"
  exit 1
fi

echo "Compile finished. Running the program..."
./"$BIN"


Recompiling with multiple gencode targets...
Compile finished. Running the program...
Starting CUDA demo with N=1024
Launching kernel1 (serial) ...
Launching kernel2 (serial) ...
Samples (first 3, last 3):
i=0 A=0 B=0 C=0 D=0
i=1 A=1 B=2 C=3 D=9
i=10 A=10 B=20 C=30 D=900
...
i=1021 A=1021 B=2042 C=3063 D=9381969
i=1022 A=1022 B=2044 C=3066 D=9400356
i=1023 A=1023 B=2046 C=3069 D=9418761
Validation PASSED
Reduction sum GPU = 3216508416
Reduction sum CPU = 3216508416
Done.
