1. Identify the !, %, and %% prefixes used in Google Colab cells.

In [1]:
# A leading "!" hands the rest of the line to the system shell (terminal command).
!nvidia-smi
!nvcc --version
!ls

Sun Feb  1 16:28:45 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
%time x = sum(range(1000000)) #Affects only one line
%cd /content


CPU times: user 20.7 ms, sys: 0 ns, total: 20.7 ms
Wall time: 20.8 ms
/content


In [3]:
%%writefile hello.cu
// %%writefile is a cell magic: it has to be the first line of the cell and
// it affects the whole cell, saving everything below it into hello.cu.
#include <stdio.h>

// Host-only program: prints a greeting from the CPU (no kernel is launched).
int main() {
  printf("Hello CUDA");

  return 0;
}


Writing hello.cu


2. Identify all key nvidia-smi commands with multiple options

In [4]:
# Basic GPU information: the default summary table.
!nvidia-smi

Sun Feb  1 16:28:45 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
# Query only the listed fields and print them as CSV.
!nvidia-smi --format=csv --query-gpu=name,memory.total,memory.used


name, memory.total [MiB], memory.used [MiB]
Tesla T4, 15360 MiB, 0 MiB


In [6]:
# Show running GPU processes: take a single sample (-c 1) from the process monitor.
!nvidia-smi pmon -c 1


# gpu         pid   type     sm    mem    enc    dec    jpg    ofa    command 
# Idx           #    C/G      %      %      %      %      %      %    name 
    0          -     -      -      -      -      -      -      -    -              


In [7]:
# Driver version only: this query returns just the driver_version field.
# The CUDA version is shown in the header of the plain `nvidia-smi` table.
!nvidia-smi --format=csv --query-gpu=driver_version


driver_version
550.54.15


3. Debug common CUDA errors (zero output, incorrect indexing, PTX errors)

In [13]:
%%writefile sum.cu
#include <stdio.h>
#include <stdlib.h>

// Stop the program with a readable message when a CUDA runtime call fails.
// `what` names the step so the message says where things went wrong.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "%s error: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Device code: a single thread adds the two device-side integers.
__global__ void sum(const int *a, const int *b, int *c) {
  *c = *a + *b;
}

// Host code: copy two ints to the GPU, add them there, copy the result back.
int main(void) {
  int a = 2;
  int b = 7;
  int c = 0;  // initialised so a failed run can never print garbage
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;
  const size_t size = sizeof(int);

  checkCuda(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)");
  checkCuda(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)");
  checkCuda(cudaMalloc((void **)&d_c, size), "cudaMalloc(d_c)");

  checkCuda(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a)");
  checkCuda(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b)");

  sum<<<1, 1>>>(d_a, d_b, d_c);

  // A kernel launch returns nothing, so launch-time failures (the original
  // note: "PTX errors") have to be fetched explicitly right after the launch.
  checkCuda(cudaGetLastError(), "Kernel launch");

  // Wait for the kernel to finish (the original note: "solves zero output");
  // the return value reports errors raised while the kernel was running.
  checkCuda(cudaDeviceSynchronize(), "Kernel execution");

  checkCuda(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  printf("a = %d\n", a);
  printf("b = %d\n", b);
  printf("a + b = %d\n", c);

  return 0;
}

Overwriting sum.cu


In [14]:
# Original note: targeting the T4's sm_75 architecture explicitly is what
# solves the "zero output" symptom.
!nvcc sum.cu -arch=sm_75 -o sum
!./sum

a = 2
b = 7
a + b = 9


4. Write a CUDA C/C++ program to demonstrate GPU kernel execution and thread indexing.

a. Launch a CUDA kernel using: 1 block and 8 threads

b. Each thread must print: Hello from GPU thread <global_thread_id>

c. Compute the global thread ID using: global_thread_id = blockIdx.x * blockDim.x + threadIdx.x

d. Clearly separate: Host code (CPU) & Device code (GPU kernel)

In [24]:
%%writefile hello_threads.cu
#include <stdio.h>
#include <cuda_runtime.h>

// ---------------------------------------------------------------------------
// Device code (GPU kernel)
// ---------------------------------------------------------------------------

// Each thread computes its global index and prints a greeting with it.
__global__ void GPU() {
    int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;

    printf("Hello from GPU thread %d\n", global_thread_id);
}

// ---------------------------------------------------------------------------
// Host code (CPU)
// ---------------------------------------------------------------------------

// Launches the kernel with 1 block of 8 threads and waits for it to finish.
// Returns non-zero if the launch or the execution failed, instead of
// silently printing nothing.
int main() {
    GPU<<<1, 8>>>();

    // A launch returns no status; launch-time failures must be queried.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Block until the kernel is done so its printf output is flushed; the
    // return value reports errors raised while the kernel was running.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel execution error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}


Overwriting hello_threads.cu


In [25]:
# Compile for compute capability 7.5 (PTX compute_75 -> binary sm_75), then run.
!nvcc -arch=compute_75 -code=sm_75 hello_threads.cu -o hello_threads
!./hello_threads

Hello from GPU thread 0
Hello from GPU thread 1
Hello from GPU thread 2
Hello from GPU thread 3
Hello from GPU thread 4
Hello from GPU thread 5
Hello from GPU thread 6
Hello from GPU thread 7


5. Write a CUDA program to demonstrate host and device memory separation.

a. Create an integer array of size 5 on the host (CPU).

b. Allocate corresponding memory on the device (GPU) using cudaMalloc().

c. Copy data from host to device using cudaMemcpy().

d. Launch a kernel where GPU threads print values from device memory.

e. Copy the data back from device to host and print it on CPU

In [28]:
%%writefile host_device_memory.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Number of array elements; the single source of truth for the host array,
// the device allocation, the launch size and the kernel bounds check.
#define N 5

// Stop the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s error: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Device code: each thread prints the element of d_arr at its own index.
// `n` is the element count; threads whose index is >= n do nothing.
__global__ void printArray(const int *d_arr, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) {
        printf("GPU thread %d: value = %d\n", id, d_arr[id]);
    }
}

// Host code: create the array on the CPU, copy it to the GPU, let GPU threads
// print it from device memory, then copy it back and print it on the CPU.
int main() {
    int h_arr[N] = {10, 20, 30, 40, 50};
    int *d_arr = NULL;
    const size_t size = N * sizeof(int);

    checkCuda(cudaMalloc((void**)&d_arr, size), "cudaMalloc");
    checkCuda(cudaMemcpy(d_arr, h_arr, size, cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device)");

    printArray<<<1, N>>>(d_arr, N);
    // A launch returns no status; launch-time failures must be queried.
    checkCuda(cudaGetLastError(), "Kernel launch");
    // Wait for the kernel so its printf output appears before the CPU output.
    checkCuda(cudaDeviceSynchronize(), "Kernel execution");

    checkCuda(cudaMemcpy(h_arr, d_arr, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");

    printf("\n");
    for (int i = 0; i < N; i++) {
        printf("CPU: h_arr[%d] = %d\n", i, h_arr[i]);
    }

    cudaFree(d_arr);
    return 0;
}


Overwriting host_device_memory.cu


In [29]:
# Compile for compute capability 7.5 (PTX compute_75 -> binary sm_75), then run.
!nvcc -gencode arch=compute_75,code=sm_75 host_device_memory.cu -o host_device_memory
!./host_device_memory


GPU thread 0: value = 10
GPU thread 1: value = 20
GPU thread 2: value = 30
GPU thread 3: value = 40
GPU thread 4: value = 50

CPU: h_arr[0] = 10
CPU: h_arr[1] = 20
CPU: h_arr[2] = 30
CPU: h_arr[3] = 40
CPU: h_arr[4] = 50


6. Compare CPU times of List/tuple with NumPy arrays.



In [32]:
import numpy as np
import time

# Number of elements doubled by each approach.
N = 10_000_000

# time.perf_counter() is used for the measurements: it is a high-resolution
# clock intended for timing short intervals and is not affected by system
# clock adjustments, unlike time.time().
# NOTE(review): the exercise title also mentions tuples; only a list is timed here.

# Pure-Python list: every element is doubled by the interpreter, one at a time.
lst = list(range(N))
start = time.perf_counter()
[x * 2 for x in lst]  # result intentionally discarded; only the duration matters
print("List time:", time.perf_counter() - start)

# NumPy array: the multiplication is a single vectorized operation.
arr = np.arange(N)
start = time.perf_counter()
arr * 2  # result intentionally discarded; only the duration matters
print("NumPy array time:", time.perf_counter() - start)


List time: 0.5246992111206055
NumPy array time: 0.024084806442260742
