<a href="https://colab.research.google.com/github/abhinavmarkanda/UCS547-Accelerated-Data-Science/blob/main/Assignment3(UCS547).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
##Q1
%%writefile vector_add.cu
#include <stdio.h>
#include <cuda_runtime.h>

#define N 1024

__global__ void vectorAdd(float *A, float *B, float *C, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        C[i] = A[i] + B[i];
    }
}

int main()
{
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;

    size_t size = N * sizeof(float);

    // Allocate host memory
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(size);

    // Initialize vectors
    for (int i = 0; i < N; i++)
    {
        h_A[i] = i * 1.0f;
        h_B[i] = i * 2.0f;
    }

    // Allocate device memory
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    // Copy to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

    // Copy result back
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    printf("First 10 elements:\n");
    for (int i = 0; i < 10; i++)
    {
        printf("C[%d] = %f\n", i, h_C[i]);
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}


Writing vector_add.cu


In [None]:
!nvcc vector_add.cu -o vector_add




In [None]:
!./vector_add


First 10 elements:
C[0] = 0.000000
C[1] = 3.000000
C[2] = 6.000000
C[3] = 9.000000
C[4] = 12.000000
C[5] = 15.000000
C[6] = 18.000000
C[7] = 21.000000
C[8] = 24.000000
C[9] = 27.000000


In [None]:
##Q2
%%writefile thrust_vector_add.cu
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>

#define N 1024

int main()
{
    // Create host vectors
    thrust::host_vector<float> h_A(N);
    thrust::host_vector<float> h_B(N);

    // Initialize vectors
    for(int i = 0; i < N; i++)
    {
        h_A[i] = i * 1.0f;
        h_B[i] = i * 2.0f;
    }

    // Copy to device vectors
    thrust::device_vector<float> d_A = h_A;
    thrust::device_vector<float> d_B = h_B;
    thrust::device_vector<float> d_C(N);

    // Perform vector addition using thrust::plus
    thrust::transform(d_A.begin(), d_A.end(),
                      d_B.begin(),
                      d_C.begin(),
                      thrust::plus<float>());

    // Copy result back to host
    thrust::host_vector<float> h_C = d_C;

    // Print first 10 results
    std::cout << "First 10 elements of C:\n";
    for(int i = 0; i < 10; i++)
    {
        std::cout << "C[" << i << "] = " << h_C[i] << std::endl;
    }

    return 0;
}


Writing thrust_vector_add.cu


In [None]:
!nvcc thrust_vector_add.cu -o thrust_vector_add




In [None]:
!./thrust_vector_add


First 10 elements of C:
C[0] = 0
C[1] = 3
C[2] = 6
C[3] = 9
C[4] = 12
C[5] = 15
C[6] = 18
C[7] = 21
C[8] = 24
C[9] = 27


In [None]:
##Q3
%%writefile thrust_dot_product.cu
#include <iostream>
#include <chrono>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>

#define N 1024

int main()
{
    // ---------------- CPU PART ----------------
    float A[N], B[N];

    for(int i = 0; i < N; i++)
    {
        A[i] = 1.0f;
        B[i] = 2.0f;
    }

    auto cpu_start = std::chrono::high_resolution_clock::now();

    float cpu_result = 0.0f;
    for(int i = 0; i < N; i++)
    {
        cpu_result += A[i] * B[i];
    }

    auto cpu_end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> cpu_time = cpu_end - cpu_start;

    // ---------------- GPU PART (THRUST) ----------------

    thrust::host_vector<float> h_A(N);
    thrust::host_vector<float> h_B(N);

    for(int i = 0; i < N; i++)
    {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    thrust::device_vector<float> d_A = h_A;
    thrust::device_vector<float> d_B = h_B;

    cudaDeviceSynchronize();
    auto gpu_start = std::chrono::high_resolution_clock::now();

    float gpu_result = thrust::inner_product(
                            d_A.begin(),
                            d_A.end(),
                            d_B.begin(),
                            0.0f);

    cudaDeviceSynchronize();
    auto gpu_end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> gpu_time = gpu_end - gpu_start;

    // ---------------- RESULTS ----------------

    std::cout << "Dot Product (CPU)  = " << cpu_result << std::endl;
    std::cout << "CPU Time (ms)      = " << cpu_time.count() << std::endl;

    std::cout << "Dot Product (GPU)  = " << gpu_result << std::endl;
    std::cout << "GPU Time (ms)      = " << gpu_time.count() << std::endl;

    return 0;
}


Overwriting thrust_dot_product.cu


In [None]:
!nvcc thrust_dot_product.cu -o thrust_dot_product




In [None]:
!./thrust_dot_product


Dot Product (CPU)  = 2048
CPU Time (ms)      = 0.003335
Dot Product (GPU)  = 2048
GPU Time (ms)      = 1.5498


In [6]:
##Q4
import numpy as np
from numba import cuda

# -----------------------------
# Matrix size
# -----------------------------
N = 16

# -----------------------------
# CUDA Kernel
# -----------------------------
@cuda.jit
def matrixMul(A, B, C):
    row, col = cuda.grid(2)

    if row < N and col < N:
        temp = 0.0
        for k in range(N):
            temp += A[row, k] * B[k, col]
        C[row, col] = temp


# -----------------------------
# Host Code
# -----------------------------
# Initialize matrices (same as your C code: all ones)
h_A = np.ones((N, N), dtype=np.float32)
h_B = np.ones((N, N), dtype=np.float32)
h_C = np.zeros((N, N), dtype=np.float32)

# Copy to device
d_A = cuda.to_device(h_A)
d_B = cuda.to_device(h_B)
d_C = cuda.to_device(h_C)

# -----------------------------
# Launch configuration
# -----------------------------
threads_per_block = (16, 16)
blocks_per_grid_x = (N + threads_per_block[0] - 1) // threads_per_block[0]
blocks_per_grid_y = (N + threads_per_block[1] - 1) // threads_per_block[1]
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

# Launch kernel
matrixMul[blocks_per_grid, threads_per_block](d_A, d_B, d_C)

# Copy back
d_C.copy_to_host(h_C)

# Output
print("C[0][0] =", h_C[0, 0])

C[0][0] = 16.0




In [7]:
!pip -q install numba cupy-cuda12x cudf-cu12 --extra-index-url=https://pypi.nvidia.com

In [8]:
##Q5
import numpy as np
import cupy as cp
import cudf
from numba import cuda
import time

# ------------------------------------------------
# Problem size
# ------------------------------------------------
N = 5_000_000

# ------------------------------------------------
# Initialize data
# ------------------------------------------------
h_A = np.random.rand(N).astype(np.float32)
h_B = np.random.rand(N).astype(np.float32)

# =================================================
# ✅ 1. CPU Sequential (NumPy)
# =================================================
start = time.time()
h_C_cpu = h_A + h_B
cpu_time = time.time() - start

print("CPU Time:", cpu_time)

# =================================================
# ✅ 2. CUDA Kernel (Numba)
# =================================================
@cuda.jit
def vecAdd(A, B, C):
    i = cuda.grid(1)
    if i < A.size:
        C[i] = A[i] + B[i]

# Copy to device
d_A = cuda.to_device(h_A)
d_B = cuda.to_device(h_B)
d_C = cuda.device_array_like(h_A)

threads = 256
blocks = (N + threads - 1) // threads

start = time.time()
vecAdd[blocks, threads](d_A, d_B, d_C)
cuda.synchronize()
cuda_time = time.time() - start

h_C_cuda = d_C.copy_to_host()

print("CUDA Kernel Time:", cuda_time)

# =================================================
# ✅ 3. Thrust-like (CuPy GPU vectorized)
# =================================================
cp_A = cp.asarray(h_A)
cp_B = cp.asarray(h_B)

start = time.time()
cp_C = cp_A + cp_B
cp.cuda.Stream.null.synchronize()
thrust_time = time.time() - start

print("Thrust (CuPy) Time:", thrust_time)

# =================================================
# ✅ 4. RAPIDS (cuDF)
# =================================================
gdf = cudf.DataFrame({
    "A": h_A,
    "B": h_B
})

start = time.time()
gdf["C"] = gdf["A"] + gdf["B"]
rapids_time = time.time() - start

print("RAPIDS Time:", rapids_time)

# =================================================
# Summary
# =================================================
print("\n===== SUMMARY =====")
print(f"CPU Time       : {cpu_time:.6f} sec")
print(f"CUDA Kernel    : {cuda_time:.6f} sec")
print(f"Thrust (CuPy)  : {thrust_time:.6f} sec")
print(f"RAPIDS (cuDF)  : {rapids_time:.6f} sec")

CPU Time: 0.012083053588867188
CUDA Kernel Time: 0.1880950927734375
Thrust (CuPy) Time: 0.00116729736328125
RAPIDS Time: 0.005854368209838867

===== SUMMARY =====
CPU Time       : 0.012083 sec
CUDA Kernel    : 0.188095 sec
Thrust (CuPy)  : 0.001167 sec
RAPIDS (cuDF)  : 0.005854 sec


In [9]:
##Q6
!pip -q install cupy-cuda12x
import cupy as cp

# --------------------------------
# Step 1: Create vector on GPU
# --------------------------------
d_vec = cp.arange(1, 11, dtype=cp.int32)  # [1..10]

# --------------------------------
# Step 2: Compute sum on GPU
# --------------------------------
gpu_sum = cp.sum(d_vec)

# --------------------------------
# Step 3: Print result
# --------------------------------
print("Vector:", d_vec)
print("Sum =", int(gpu_sum))

Vector: [ 1  2  3  4  5  6  7  8  9 10]
Sum = 55


In [13]:
##Q7
!pip -q install cupy-cuda12x
import cupy as cp

d_vec = cp.array([7, 2, 9, 1, 5, 3, 8, 4], dtype=cp.int32)

print("Before sorting:")
print(d_vec)

# FIX: store the result
d_vec = cp.sort(d_vec)

print("\nAfter sorting:")
print(d_vec)

Before sorting:
[7 2 9 1 5 3 8 4]

After sorting:
[1 2 3 4 5 7 8 9]
