In [None]:
!nvidia-smi

Thu Dec 18 18:19:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
%%writefile vector_add.cu
#include <cuda_runtime.h>
#include <iostream>
#include <cmath>

// CUDA Kernel
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. The kernel uses a grid-stride loop, so
// the result is complete for any grid/block size, including launches with
// fewer threads than elements. No shared memory is used.
//
// a, b : input arrays, dereferenced on the device, at least n floats each.
// c    : output array, dereferenced on the device, at least n floats.
// n    : number of elements to process; n <= 0 makes the kernel a no-op.
__global__ void vectorAdd(const float* a, const float* b, float* c, int n) {
    // Index math is done in 64 bits: the unsigned product
    // blockIdx.x * blockDim.x narrowed to int could wrap to a negative value,
    // pass the `idx < n` test, and index out of bounds.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    for (long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < n;
         idx += stride) {
        c[idx] = a[idx] + b[idx];
    }
}

// Main
// Host driver for the vectorAdd kernel.
//
// Fills two host vectors, computes a CPU reference sum, runs vectorAdd on the
// device, copies the result back, and compares it against the reference.
//
// Returns 0 when every CUDA call succeeded (regardless of the PASSED/FAILED
// verdict, which is reported on stdout) and -1 when any CUDA runtime call or
// the kernel launch failed. All host and device buffers are released on every
// path.
int main() {
    const int n = 10000000;
    const size_t size = static_cast<size_t>(n) * sizeof(float);

    // Prints "<what>: <CUDA error string>" to stderr when `status` is not
    // cudaSuccess and returns whether the call succeeded.
    auto cudaOk = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // Host memory
    float *h_a = new float[n];
    float *h_b = new float[n];
    float *h_c = new float[n];
    float *h_ref = new float[n];

    for (int i = 0; i < n; i++) {
        h_a[i]   = i * 0.5f;
        h_b[i]   = i * 0.25f;
        h_ref[i] = h_a[i] + h_b[i];
    }

    // Device memory. Pointers start as nullptr so the cleanup below can call
    // cudaFree unconditionally even when an allocation failed.
    float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

    // Kernel launch configuration: ceil-div so the grid covers all n elements.
    const int blockSize = 512;
    const int gridSize  = (n + blockSize - 1) / blockSize;

    // && short-circuits, so the first failing call stops the chain.
    bool ok = cudaOk(cudaMalloc(&d_a, size), "cudaMalloc(d_a) failed")
           && cudaOk(cudaMalloc(&d_b, size), "cudaMalloc(d_b) failed")
           && cudaOk(cudaMalloc(&d_c, size), "cudaMalloc(d_c) failed")
           && cudaOk(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(d_a) failed")
           && cudaOk(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(d_b) failed");

    if (ok) {
        vectorAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

        // cudaGetLastError reports launch-configuration errors; the
        // synchronize reports errors raised while the kernel was executing.
        ok = cudaOk(cudaGetLastError(), "Kernel launch error")
          && cudaOk(cudaDeviceSynchronize(), "Kernel execution error")
          && cudaOk(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(h_c) failed");
    }

    if (ok) {
        // Verification
        bool passed = true;
        for (int i = 0; i < n; i++) {
            // Absolute + relative tolerance: the values reach ~7.5e6, where a
            // purely absolute 1e-5 bound degenerates to exact equality.
            const float tol = 1e-5f + 1e-5f * std::fabs(h_ref[i]);
            // Written as !(diff <= tol) so that a NaN result counts as a
            // mismatch instead of slipping through a `diff > tol` test.
            if (!(std::fabs(h_c[i] - h_ref[i]) <= tol)) {
                std::cout << "Mismatch at index " << i
                          << " | GPU: " << h_c[i]
                          << " | CPU: " << h_ref[i] << std::endl;
                passed = false;
                break;
            }
        }

        if (passed) {
            std::cout << "Vector Addition PASSED" << std::endl;
            std::cout << "n = " << n
                      << ", blockSize = " << blockSize
                      << ", gridSize = " << gridSize
                      << ", totalThreads = "
                      << static_cast<long long>(gridSize) * blockSize
                      << std::endl;
        } else {
            std::cout << "Vector Addition FAILED" << std::endl;
        }
    }

    // Single cleanup path shared by success and failure.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    delete[] h_ref;

    return ok ? 0 : -1;
}


Overwriting vector_add.cu


In [None]:
# !nvcc vector_add.cu -o vector_add
!nvcc vector_add.cu -o vector_add \
  -gencode arch=compute_75,code=sm_75 \
  -Xptxas -v

!./vector_add


ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z9vectorAddPKfS0_Pfi' for 'sm_75'
ptxas info    : Function properties for _Z9vectorAddPKfS0_Pfi
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 12 registers, 380 bytes cmem[0]
Vector Addition PASSED
n = 10000000, blockSize = 512, gridSize = 19532, totalThreads = 10000384


In [None]:
%%writefile vector_add2.cu
//task3
#include <cuda_runtime.h>
#include <iostream>
#include <cmath>

// Kernel
// Element-wise vector addition used for the timing experiment:
// c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. The kernel uses a grid-stride loop, so
// every element is processed for any grid/block size. No shared memory is used.
//
// a, b : input arrays, dereferenced on the device, at least n floats each.
// c    : output array, dereferenced on the device, at least n floats.
// n    : number of elements to process; n <= 0 makes the kernel a no-op.
__global__ void vectorAdd2(const float* a, const float* b, float* c, int n) {
    // 64-bit index math avoids the wrap-around that can occur when the
    // unsigned product blockIdx.x * blockDim.x is narrowed to int.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    while (idx < n) {
        c[idx] = a[idx] + b[idx];
        idx += stride;
    }
}

// Main
// Host driver that times the vectorAdd2 kernel with CUDA events.
//
// Fills two host vectors, computes a CPU reference, prints the launch
// configuration, measures the kernel between two recorded events, copies the
// result back, and verifies it against the reference.
//
// Returns 0 when every CUDA call succeeded (the PASSED/FAILED verdict is
// reported on stdout) and -1 when any CUDA runtime call or the kernel launch
// failed. Buffers and events are released on every path.
int main() {
    const int n = 10000000;   // Change for Task 2 / Task 3 experiments
    const size_t size = static_cast<size_t>(n) * sizeof(float);

    // Prints "<what>: <CUDA error string>" to stderr when `status` is not
    // cudaSuccess and returns whether the call succeeded.
    auto cudaOk = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // Host memory
    float *h_a = new float[n];
    float *h_b = new float[n];
    float *h_c = new float[n];
    float *h_ref = new float[n];

    for (int i = 0; i < n; i++) {
        h_a[i] = 0.5f * i;
        h_b[i] = 0.25f * i;
        h_ref[i] = h_a[i] + h_b[i];
    }

    // Device memory. nullptr-initialised so cleanup can free unconditionally.
    float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

    // && short-circuits, so the first failing call stops the chain.
    bool ok = cudaOk(cudaMalloc(&d_a, size), "cudaMalloc(d_a) failed")
           && cudaOk(cudaMalloc(&d_b, size), "cudaMalloc(d_b) failed")
           && cudaOk(cudaMalloc(&d_c, size), "cudaMalloc(d_c) failed")
           && cudaOk(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(d_a) failed")
           && cudaOk(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(d_b) failed");

    const int blockSize = 512;
    const int gridSize  = (n + blockSize - 1) / blockSize;

    std::cout << "n = " << n
              << ", blockSize = " << blockSize
              << ", gridSize = " << gridSize
              << ", totalThreads = "
              << static_cast<long long>(gridSize) * blockSize
              << std::endl;

    // CUDA Timing using Events
    cudaEvent_t start = nullptr, stop = nullptr;
    float elapsed_ms = 0.0f;

    if (ok) {
        ok = cudaOk(cudaEventCreate(&start), "cudaEventCreate(start) failed")
          && cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop) failed")
          && cudaOk(cudaEventRecord(start), "cudaEventRecord(start) failed");
    }

    if (ok) {
        vectorAdd2<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

        // Check the launch itself before trusting the measured interval:
        // a rejected launch would otherwise still yield a (meaningless) time.
        ok = cudaOk(cudaGetLastError(), "Kernel launch error")
          && cudaOk(cudaEventRecord(stop), "cudaEventRecord(stop) failed")
          && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop) failed")
          && cudaOk(cudaEventElapsedTime(&elapsed_ms, start, stop),
                    "cudaEventElapsedTime failed");
    }

    if (ok) {
        std::cout << "Kernel execution time: "
                  << elapsed_ms << " ms" << std::endl;
    }

    // Only destroy events that were actually created.
    if (start) {
        cudaEventDestroy(start);
    }
    if (stop) {
        cudaEventDestroy(stop);
    }

    // Copy back
    if (ok) {
        ok = cudaOk(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(h_c) failed");
    }

    if (ok) {
        // Correctness Check
        bool passed = true;
        for (int i = 0; i < n; i++) {
            // Absolute + relative tolerance; !(diff <= tol) also flags NaN,
            // which a `diff > tol` comparison would silently accept.
            const float tol = 1e-5f + 1e-5f * std::fabs(h_ref[i]);
            if (!(std::fabs(h_c[i] - h_ref[i]) <= tol)) {
                std::cout << "Mismatch at index " << i << std::endl;
                passed = false;
                break;
            }
        }

        if (passed)
            std::cout << "Vector Addition PASSED" << std::endl;
        else
            std::cout << "Vector Addition FAILED " << std::endl;
    }

    // Single cleanup path shared by success and failure.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    delete[] h_ref;

    return ok ? 0 : -1;
}


Overwriting vector_add2.cu


In [None]:
# !nvcc vector_add2.cu -o vector_add2
!nvcc vector_add2.cu -o vector_add2 \
  -gencode arch=compute_75,code=sm_75 \
  -Xptxas -v

!./vector_add2

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z10vectorAdd2PKfS0_Pfi' for 'sm_75'
ptxas info    : Function properties for _Z10vectorAdd2PKfS0_Pfi
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 12 registers, 380 bytes cmem[0]
n = 10000000, blockSize = 512, gridSize = 19532, totalThreads = 10000384
Kernel execution time: 0.558368 ms
Vector Addition PASSED


In [None]:
%%writefile multiply_scale.cu
#include <cuda_runtime.h>
#include <iostream>
#include <cmath>

// Kernel
// Scaled element-wise product: c[i] = alpha * a[i] * b[i] for i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. The kernel uses a grid-stride loop, so
// every element is processed for any grid/block size. No shared memory is used.
//
// a, b  : input arrays, dereferenced on the device, at least n floats each.
// c     : output array, dereferenced on the device, at least n floats.
// alpha : scalar factor applied to every product.
// n     : number of elements to process; n <= 0 makes the kernel a no-op.
__global__ void multiplyScale(const float* a, const float* b,
                              float* c, float alpha, int n) {
    // 64-bit index math avoids the wrap-around that can occur when the
    // unsigned product blockIdx.x * blockDim.x is narrowed to int.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    for (long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < n;
         idx += stride) {
        c[idx] = alpha * a[idx] * b[idx];
    }
}

// Main
// Host driver for the multiplyScale kernel.
//
// Fills two host vectors, computes the CPU reference alpha * a[i] * b[i],
// runs multiplyScale on the device, copies the result back, and compares it
// against the reference.
//
// Returns 0 when every CUDA call succeeded (the PASSED/FAILED verdict is
// reported on stdout) and -1 when any CUDA runtime call or the kernel launch
// failed. All host and device buffers are released on every path.
int main() {
    const int n = 100000;
    const float alpha = 2.5f;
    const size_t size = static_cast<size_t>(n) * sizeof(float);

    // Prints "<what>: <CUDA error string>" to stderr when `status` is not
    // cudaSuccess and returns whether the call succeeded.
    auto cudaOk = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // Host memory
    float *h_a   = new float[n];
    float *h_b   = new float[n];
    float *h_c   = new float[n];
    float *h_ref = new float[n];

    for (int i = 0; i < n; i++) {
        h_a[i]   = 0.01f * i;
        h_b[i]   = 0.02f * i;
        h_ref[i] = alpha * h_a[i] * h_b[i];
    }

    // Device memory. nullptr-initialised so cleanup can free unconditionally.
    float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

    // Ceil-div so the grid covers all n elements.
    const int blockSize = 128;
    const int gridSize  = (n + blockSize - 1) / blockSize;

    // && short-circuits, so the first failing call stops the chain.
    bool ok = cudaOk(cudaMalloc(&d_a, size), "cudaMalloc(d_a) failed")
           && cudaOk(cudaMalloc(&d_b, size), "cudaMalloc(d_b) failed")
           && cudaOk(cudaMalloc(&d_c, size), "cudaMalloc(d_c) failed")
           && cudaOk(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(d_a) failed")
           && cudaOk(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(d_b) failed");

    if (ok) {
        multiplyScale<<<gridSize, blockSize>>>(d_a, d_b, d_c, alpha, n);

        // Error check + sync: the first call reports launch errors, the
        // second reports errors raised while the kernel was executing.
        ok = cudaOk(cudaGetLastError(), "Kernel error")
          && cudaOk(cudaDeviceSynchronize(), "Kernel execution error")
          && cudaOk(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(h_c) failed");
    }

    if (ok) {
        // Verification
        bool passed = true;
        for (int i = 0; i < n; i++) {
            // The products reach ~5e6, far beyond where an absolute 1e-5
            // bound is meaningful for floats, so scale the bound with the
            // reference. !(diff <= tol) also rejects NaN results.
            const float tol = 1e-5f + 1e-5f * std::fabs(h_ref[i]);
            if (!(std::fabs(h_c[i] - h_ref[i]) <= tol)) {
                std::cout << "Mismatch at index " << i
                          << " GPU: " << h_c[i]
                          << " CPU: " << h_ref[i] << std::endl;
                passed = false;
                break;
            }
        }

        if (passed)
            std::cout << "Multiply & Scale PASSED\n";
        else
            std::cout << "Multiply & Scale FAILED\n";
    }

    // Single cleanup path shared by success and failure.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    delete[] h_ref;

    return ok ? 0 : -1;
}


Writing multiply_scale.cu


In [None]:
# !nvcc multiply_scale.cu -o multiply_scale
!nvcc multiply_scale.cu -o multiply_scale \
  -gencode arch=compute_75,code=sm_75 \
  -Xptxas -v
!./multiply_scale

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z13multiplyScalePKfS0_Pffi' for 'sm_75'
ptxas info    : Function properties for _Z13multiplyScalePKfS0_Pffi
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 12 registers, 384 bytes cmem[0]
Multiply & Scale PASSED


In [None]:
%%writefile relu.cu
#include <cuda_runtime.h>
#include <iostream>
#include <cmath>

// Kernel
// Element-wise ReLU: y[i] = x[i] if x[i] > 0, otherwise 0, for i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. The kernel uses a grid-stride loop, so
// every element is processed for any grid/block size. No shared memory is used.
//
// x : input array, dereferenced on the device, at least n floats.
// y : output array, dereferenced on the device, at least n floats.
// n : number of elements to process; n <= 0 makes the kernel a no-op.
__global__ void relu(const float* x, float* y, int n) {
    // 64-bit index math avoids the wrap-around that can occur when the
    // unsigned product blockIdx.x * blockDim.x is narrowed to int.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    for (long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < n;
         idx += stride) {
        // Load once and reuse for both the comparison and the store.
        const float v = x[idx];
        y[idx] = (v > 0.0f) ? v : 0.0f;
    }
}

// Main
// Host driver for the relu kernel.
//
// Fills a host vector with a mix of negative and positive values, computes
// the CPU reference, runs relu on the device, copies the result back, and
// compares it against the reference.
//
// Returns 0 when every CUDA call succeeded (the PASSED/FAILED verdict is
// reported on stdout) and -1 when any CUDA runtime call or the kernel launch
// failed. All host and device buffers are released on every path.
int main() {
    const int n = 100000;
    const size_t size = static_cast<size_t>(n) * sizeof(float);

    // Prints "<what>: <CUDA error string>" to stderr when `status` is not
    // cudaSuccess and returns whether the call succeeded.
    auto cudaOk = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // Host memory
    float *h_x   = new float[n];
    float *h_y   = new float[n];
    float *h_ref = new float[n];

    for (int i = 0; i < n; i++) {
        // Every third element is negated so both ReLU branches are exercised.
        h_x[i]   = (i % 3 == 0) ? -i * 0.1f : i * 0.1f;
        h_ref[i] = (h_x[i] > 0.0f) ? h_x[i] : 0.0f;
    }

    // Device memory. nullptr-initialised so cleanup can free unconditionally.
    float *d_x = nullptr, *d_y = nullptr;

    // Ceil-div so the grid covers all n elements.
    const int blockSize = 256;
    const int gridSize  = (n + blockSize - 1) / blockSize;

    // && short-circuits, so the first failing call stops the chain.
    bool ok = cudaOk(cudaMalloc(&d_x, size), "cudaMalloc(d_x) failed")
           && cudaOk(cudaMalloc(&d_y, size), "cudaMalloc(d_y) failed")
           && cudaOk(cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(d_x) failed");

    if (ok) {
        relu<<<gridSize, blockSize>>>(d_x, d_y, n);

        // Error check + sync: the first call reports launch errors, the
        // second reports errors raised while the kernel was executing.
        ok = cudaOk(cudaGetLastError(), "Kernel error")
          && cudaOk(cudaDeviceSynchronize(), "Kernel execution error")
          && cudaOk(cudaMemcpy(h_y, d_y, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(h_y) failed");
    }

    if (ok) {
        // Verification. ReLU only copies or zeroes its input, so a small
        // absolute bound is kept; !(diff <= bound) also rejects NaN results,
        // which a `diff > bound` comparison would silently accept.
        bool passed = true;
        for (int i = 0; i < n; i++) {
            if (!(std::fabs(h_y[i] - h_ref[i]) <= 1e-5f)) {
                std::cout << "Mismatch at index " << i
                          << " GPU: " << h_y[i]
                          << " CPU: " << h_ref[i] << std::endl;
                passed = false;
                break;
            }
        }

        if (passed)
            std::cout << "ReLU PASSED\n";
        else
            std::cout << "ReLU FAILED\n";
    }

    // Single cleanup path shared by success and failure.
    cudaFree(d_x);
    cudaFree(d_y);
    delete[] h_x;
    delete[] h_y;
    delete[] h_ref;

    return ok ? 0 : -1;
}


Overwriting relu.cu


In [None]:
# !nvcc relu.cu -o relu
!nvcc relu.cu -o relu \
  -gencode arch=compute_75,code=sm_75 \
  -Xptxas -v
!./relu

ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z4reluPKfPfi' for 'sm_75'
ptxas info    : Function properties for _Z4reluPKfPfi
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 372 bytes cmem[0]
ReLU PASSED
