<a href="https://colab.research.google.com/github/Yuxuan-Zhang-Dexter/cuda-practice/blob/main/cuda_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Installation

In [None]:
!nvidia-smi

Wed Jan 10 21:00:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
# Install the extension
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

# Load the extension into Colab
%load_ext nvcc_plugin

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-ei2_zvyh
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-ei2_zvyh
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0d2ab99cccbbc682722e708515fe9c4cfc50185a
  Preparing metadata (setup.py) ... [?25l[?25hdone
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


# Intro to CUDA code

In [None]:
### First Compile Cuda Code
%%cu
#include <iostream>
// Host-only program: no kernel is launched here. It prints a greeting and
// exits, which is enough to confirm the cell compiles and runs.
int main( void ) {
 const char *greeting = "Hello, World!\n";
 printf( "%s", greeting );
 return 0;
}

Hello, World!



In [None]:
### Kernel Call - the kernel function runs on the device, the main function runs on the host
%%cu
#include <iostream>

// Empty kernel. The `__global__` qualifier marks a function that host code
// launches with the <<<blocks, threads>>> syntax; the body is intentionally
// blank, so a launch performs no work.
__global__ void kernel() {
    // Kernel code goes here
}

// Host entry point: launches the empty kernel with one block of one thread,
// verifies the launch and its execution, then prints a greeting.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    kernel<<<1,1>>>(); // Launch the kernel

    // A kernel launch does not return a status. Launch-configuration errors
    // are reported by cudaGetLastError(); errors raised while the kernel runs
    // are reported by the next synchronizing call.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        printf("kernel failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    printf("Hello, World!\n");
    return 0;
}


Hello, World!



In [None]:
### passing parameter to the device
%%cu
#include <iostream>
// Kernel: adds the two by-value integers and stores the result in the int
// that `c` points to. `c` is dereferenced inside the kernel, so the caller
// must pass a pointer that is valid in device code.
__global__ void add( int a, int b, int *c ) {
 const int sum = a + b;
 c[0] = sum;
}
// Host entry point: allocates one int on the device, launches add() to compute
// 2 + 7 into it, copies the result back and prints it.
// Every CUDA call is checked; on the first failure the error text is printed
// instead of the (then meaningless) result. Returns 0 on success, 1 on failure.
int main( void ) {
 int c = 0;
 int *dev_c = 0;

 // allocate int space on the device
 cudaError_t status = cudaMalloc( (void**)&dev_c, sizeof(int) );

 if ( status == cudaSuccess ) {
  add<<<1,1>>>( 2, 7, dev_c );
  // a launch returns nothing; configuration errors surface here
  status = cudaGetLastError();
 }

 if ( status == cudaSuccess ) {
  // read the content from device to host; this blocking copy also reports
  // any error raised while the kernel was running
  status = cudaMemcpy( &c, dev_c, sizeof(int), cudaMemcpyDeviceToHost );
 }

 if ( status == cudaSuccess ) {
  printf( "2 + 7 = %d\n", c );
 } else {
  printf( "CUDA error: %s\n", cudaGetErrorString( status ) );
 }

 cudaFree( dev_c ); // free device memory (no-op when dev_c is still 0)
 return status == cudaSuccess ? 0 : 1;
}

2 + 7 = 9



In [None]:
### print device info
%%cu
#include <iostream>

// Host entry point: enumerates the CUDA devices and prints a selection of the
// fields of each device's cudaDeviceProp.
//
// Changes relative to the naive version:
//   * cudaGetDeviceCount / cudaGetDeviceProperties results are checked;
//   * size_t fields are printed with %zu instead of %ld;
//   * copy/compute overlap is derived from asyncEngineCount, which replaces
//     the deprecated deviceOverlap field;
//   * sharedMemPerBlock and regsPerBlock are labelled "per block", which is
//     what those fields hold.
// Returns 0 on success, 1 if a CUDA query fails.
int main(void) {
    int count = 0;
    cudaError_t status = cudaGetDeviceCount(&count);
    if (status != cudaSuccess) {
        printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for (int i = 0; i < count; i++) {
        cudaDeviceProp prop;
        status = cudaGetDeviceProperties(&prop, i);
        if (status != cudaSuccess) {
            printf("cudaGetDeviceProperties(%d) failed: %s\n", i, cudaGetErrorString(status));
            return 1;
        }

        printf("--- General Information for device %d ---\n", i);
        printf("Name: %s\n", prop.name);
        printf("Compute capability: %d.%d\n", prop.major, prop.minor);
        printf("Clock rate: %d\n", prop.clockRate);
        printf("Device copy overlap: %s\n",
               prop.asyncEngineCount > 0 ? "Enabled" : "Disabled");
        printf("Kernel execution timeout: %s\n",
               prop.kernelExecTimeoutEnabled ? "Enabled" : "Disabled");

        printf("--- Memory Information for device %d ---\n", i);
        printf("Total global mem: %zu\n", prop.totalGlobalMem);
        printf("Total constant Mem: %zu\n", prop.totalConstMem);
        printf("Max mem pitch: %zu\n", prop.memPitch);
        printf("Texture Alignment: %zu\n", prop.textureAlignment);

        printf("--- MP Information for device %d ---\n", i);
        printf("Multiprocessor count: %d\n", prop.multiProcessorCount);
        printf("Shared mem per block: %zu\n", prop.sharedMemPerBlock);
        printf("Registers per block: %d\n", prop.regsPerBlock);
        printf("Threads in warp: %d\n", prop.warpSize);
        printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
        printf("Max thread dimensions: (%d, %d, %d)\n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("Max grid dimensions: (%d, %d, %d)\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("\n");
    }

    return 0;
}


--- General Information for device 0 ---
Name: Tesla T4
Compute capability: 7.5
Clock rate: 1590000
Device copy overlap: Enabled
Kernel execution timeout: Disabled
--- Memory Information for device 0 ---
Total global mem: 15835660288
Total constant Mem: 65536
Max mem pitch: 2147483647
Texture Alignment: 512
--- MP Information for device 0 ---
Multiprocessor count: 40
Shared mem per mp: 49152
Registers per mp: 65536
Threads in warp: 32
Max threads per block: 1024
Max thread dimensions: (1024, 1024, 64)
Max grid dimensions: (2147483647, 65535, 65535)




In [None]:
### look for the device that best matches a requested compute capability
%%cu
#include <iostream>
// Host entry point: prints the current device id, asks the runtime for the
// device closest to compute capability 1.3, prints that id and selects it.
// Each runtime call is checked. Returns 0 on success, 1 on the first failure.
int main( void ) {
    int dev = 0;
    cudaError_t status = cudaGetDevice( &dev );
    if ( status != cudaSuccess ) {
        printf( "cudaGetDevice failed: %s\n", cudaGetErrorString( status ) );
        return 1;
    }
    printf( "ID of current CUDA device: %d\n", dev );

    // Value-initialization zeroes every field, so only the fields set below
    // take part in the match (replaces memset, which needed <cstring>).
    cudaDeviceProp prop = {};
    prop.major = 1;
    prop.minor = 3;

    status = cudaChooseDevice( &dev, &prop );
    if ( status != cudaSuccess ) {
        printf( "cudaChooseDevice failed: %s\n", cudaGetErrorString( status ) );
        return 1;
    }
    printf( "ID of CUDA device closest to revision 1.3: %d\n", dev );

    status = cudaSetDevice( dev );
    if ( status != cudaSuccess ) {
        printf( "cudaSetDevice failed: %s\n", cudaGetErrorString( status ) );
        return 1;
    }
    return 0;
}

ID of current CUDA device: 0
ID of CUDA device closest to revision 1.3: 0



# Parallel Computing on CPU and GPU

In [None]:
### CPU Vector Sums - with one CPU, tid visits [0, 1, 2, ..., 9]; with two CPUs, tid would visit [0, 2, 4, ..., 8] on one CPU and [1, 3, 5, ..., 9] on the other
%%cu
#include <iostream>

#define N 10

// Element-wise sum on the CPU: c[i] = a[i] + b[i] for every i in [0, N).
// A single CPU walks the indices one by one, starting at zero.
void add(int *a, int *b, int *c) {
    for (int idx = 0; idx < N; ++idx) {
        c[idx] = a[idx] + b[idx];
    }
}

// Host entry point: builds two N-element inputs (a[i] = -i, b[i] = i*i),
// sums them with add() and prints one "a + b = c" line per element.
int main(void) {
    int a[N], b[N], c[N];
    int i;

    // populate both inputs on the CPU
    for (i = 0; i != N; ++i) {
        a[i] = -i;
        b[i] = i * i;
    }

    add(a, b, c);

    // print every element of the result next to its operands
    i = 0;
    while (i < N) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
        ++i;
    }

    return 0;
}


0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72



In [None]:
### GPU VECTORS SUMS - BLOCKS
%%cu
#include <iostream>
#include <cuda_runtime.h>
#define N 10

// Kernel: uses blockIdx.x as the element index, prints that index, and
// writes c[index] = a[index] + b[index] when the index is below N.
__global__ void add(int *a, int *b, int *c) {
    const int block = blockIdx.x;
    printf("blockIdx is %d \n", block);

    // indices at or beyond N have no element to process
    if (block >= N) {
        return;
    }
    c[block] = a[block] + b[block];
}

// Prints a diagnostic when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        printf("CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Host entry point: sums two N-element vectors on the GPU using N blocks of
// one thread each, then prints the result.
// Every CUDA call is checked; after the first failure the remaining steps are
// skipped, device memory is still released, and 1 is returned (0 on success).
int main(void) {
    int a[N], b[N], c[N];
    int *dev_a = 0, *dev_b = 0, *dev_c = 0;

    // Fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // Allocate the memory on the GPU
    bool ok = cudaOk(cudaMalloc((void**)&dev_a, N * sizeof(int)), "cudaMalloc dev_a")
           && cudaOk(cudaMalloc((void**)&dev_b, N * sizeof(int)), "cudaMalloc dev_b")
           && cudaOk(cudaMalloc((void**)&dev_c, N * sizeof(int)), "cudaMalloc dev_c");

    // Copy the arrays 'a' and 'b' to the GPU
    ok = ok
      && cudaOk(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice), "copy a to device")
      && cudaOk(cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice), "copy b to device");

    if (ok) {
        // <<<number_of_blocks, number_of_threads_per_block>>>: N blocks, one thread per block.
        add<<<N,1>>>(dev_a, dev_b, dev_c);
        // A launch returns no status; configuration errors surface here.
        ok = cudaOk(cudaGetLastError(), "add launch");
    }

    // Copy the array 'c' back from the GPU to the CPU. This blocking copy also
    // reports any error raised while the kernel was running.
    ok = ok
      && cudaOk(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost), "copy c to host");

    // Display the results
    if (ok) {
        for (int i = 0; i < N; i++) {
            printf("%d + %d = %d\n", a[i], b[i], c[i]);
        }
    }

    // Free the memory allocated on the GPU (cudaFree on a 0 pointer is a no-op)
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return ok ? 0 : 1;
}





blockIdx is 2 
blockIdx is 7 
blockIdx is 3 
blockIdx is 8 
blockIdx is 1 
blockIdx is 0 
blockIdx is 6 
blockIdx is 5 
blockIdx is 4 
blockIdx is 9 
0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72



In [None]:
### GPU VECTORS SUMS - Threads (only the kernel method changed)
#### the number of blocks per grid dimension is limited by the hardware (see 'Max grid dimensions' in the device info above), and the number of threads per block is limited by 'maxThreadsPerBlock'
%%cu
#include <iostream>
#include <cuda_runtime.h>
#define N 10

// Kernel: uses threadIdx.x as the element index, prints that index, and
// writes c[index] = a[index] + b[index] when the index is below N.
__global__ void add(int *a, int *b, int *c) {
    const int lane = threadIdx.x;
    printf("threadIdx is %d \n", lane);

    // indices at or beyond N have no element to process
    if (lane >= N) {
        return;
    }
    c[lane] = a[lane] + b[lane];
}

// Prints a diagnostic when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        printf("CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Host entry point: sums two N-element vectors on the GPU using a single
// block of N threads, then prints the result.
// Every CUDA call is checked; after the first failure the remaining steps are
// skipped, device memory is still released, and 1 is returned (0 on success).
int main(void) {
    int a[N], b[N], c[N];
    int *dev_a = 0, *dev_b = 0, *dev_c = 0;

    // Fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // Allocate the memory on the GPU
    bool ok = cudaOk(cudaMalloc((void**)&dev_a, N * sizeof(int)), "cudaMalloc dev_a")
           && cudaOk(cudaMalloc((void**)&dev_b, N * sizeof(int)), "cudaMalloc dev_b")
           && cudaOk(cudaMalloc((void**)&dev_c, N * sizeof(int)), "cudaMalloc dev_c");

    // Copy the arrays 'a' and 'b' to the GPU
    ok = ok
      && cudaOk(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice), "copy a to device")
      && cudaOk(cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice), "copy b to device");

    if (ok) {
        // <<<number_of_blocks, number_of_threads_per_block>>>: one block of N threads.
        add<<<1,N>>>(dev_a, dev_b, dev_c);
        // A launch returns no status; configuration errors surface here.
        ok = cudaOk(cudaGetLastError(), "add launch");
    }

    // Copy the array 'c' back from the GPU to the CPU. This blocking copy also
    // reports any error raised while the kernel was running.
    ok = ok
      && cudaOk(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost), "copy c to host");

    // Display the results
    if (ok) {
        for (int i = 0; i < N; i++) {
            printf("%d + %d = %d\n", a[i], b[i], c[i]);
        }
    }

    // Free the memory allocated on the GPU (cudaFree on a 0 pointer is a no-op)
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return ok ? 0 : 1;
}


threadIdx is 0 
threadIdx is 1 
threadIdx is 2 
threadIdx is 3 
threadIdx is 4 
threadIdx is 5 
threadIdx is 6 
threadIdx is 7 
threadIdx is 8 
threadIdx is 9 
0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72



Explanation of Thread, Block, and Grid:

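gridDim stores the number of blocks along each dimension of the grid, and blockDim stores the number of threads along each dimension of a block. Both are three-component values (x, y, z), which matches the three-component "Max grid dimensions" and "Max thread dimensions" reported by the device above.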

![Thread, Block, and Grid in One Kernel](https://nyu-cds.github.io/python-gpu/fig/02-threadmapping.png)

In [None]:
### GPU SUMS OF A LONGER VECTOR - BLOCK + THREAD (only the kernel method and launch configuration changed)
%%cu
#include <iostream>
#include <cuda_runtime.h>
#define N 512

// Kernel: combines the block index and the thread index into one flat index
// (blockIdx.x * blockDim.x + threadIdx.x) and writes
// c[index] = a[index] + b[index] when that index is below N.
__global__ void add(int *a, int *b, int *c) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // printf("Idx is %d \n", idx);

    // the launch may create more threads than elements; the extras do nothing
    if (idx >= N) {
        return;
    }
    c[idx] = a[idx] + b[idx];
}

// Prints a diagnostic when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        printf("CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Host entry point: sums two N-element vectors on the GPU using blocks of 128
// threads (enough blocks to cover N), then prints the result.
// Every CUDA call is checked; after the first failure the remaining steps are
// skipped, device memory is still released, and 1 is returned (0 on success).
int main(void) {
    int a[N], b[N], c[N];
    int *dev_a = 0, *dev_b = 0, *dev_c = 0;
    const int threadsPerBlock = 128;
    // ceil(N / threadsPerBlock), so a partial last block is still launched
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // Allocate the memory on the GPU
    bool ok = cudaOk(cudaMalloc((void**)&dev_a, N * sizeof(int)), "cudaMalloc dev_a")
           && cudaOk(cudaMalloc((void**)&dev_b, N * sizeof(int)), "cudaMalloc dev_b")
           && cudaOk(cudaMalloc((void**)&dev_c, N * sizeof(int)), "cudaMalloc dev_c");

    // Copy the arrays 'a' and 'b' to the GPU
    ok = ok
      && cudaOk(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice), "copy a to device")
      && cudaOk(cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice), "copy b to device");

    if (ok) {
        // <<<number_of_blocks, number_of_threads_per_block>>>
        add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c);
        // A launch returns no status; configuration errors surface here.
        ok = cudaOk(cudaGetLastError(), "add launch");
    }

    // Copy the array 'c' back from the GPU to the CPU. This blocking copy also
    // reports any error raised while the kernel was running.
    ok = ok
      && cudaOk(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost), "copy c to host");

    // Display the results
    if (ok) {
        for (int i = 0; i < N; i++) {
            printf("%d + %d = %d\n", a[i], b[i], c[i]);
        }
    }

    // Free the memory allocated on the GPU (cudaFree on a 0 pointer is a no-op)
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return ok ? 0 : 1;
}


0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72
-10 + 100 = 90
-11 + 121 = 110
-12 + 144 = 132
-13 + 169 = 156
-14 + 196 = 182
-15 + 225 = 210
-16 + 256 = 240
-17 + 289 = 272
-18 + 324 = 306
-19 + 361 = 342
-20 + 400 = 380
-21 + 441 = 420
-22 + 484 = 462
-23 + 529 = 506
-24 + 576 = 552
-25 + 625 = 600
-26 + 676 = 650
-27 + 729 = 702
-28 + 784 = 756
-29 + 841 = 812
-30 + 900 = 870
-31 + 961 = 930
-32 + 1024 = 992
-33 + 1089 = 1056
-34 + 1156 = 1122
-35 + 1225 = 1190
-36 + 1296 = 1260
-37 + 1369 = 1332
-38 + 1444 = 1406
-39 + 1521 = 1482
-40 + 1600 = 1560
-41 + 1681 = 1640
-42 + 1764 = 1722
-43 + 1849 = 1806
-44 + 1936 = 1892
-45 + 2025 = 1980
-46 + 2116 = 2070
-47 + 2209 = 2162
-48 + 2304 = 2256
-49 + 2401 = 2352
-50 + 2500 = 2450
-51 + 2601 = 2550
-52 + 2704 = 2652
-53 + 2809 = 2756
-54 + 2916 = 2862
-55 + 3025 = 2970
-56 + 3136 = 3080
-57 + 3249 = 3192
-58 + 3364 = 3306
-59 + 3481 = 3422
-60 + 3600 = 3540
-61 + 3

In [None]:
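### GPU SUMS OF ARBITRARILY LONG VECTORS (only the kernel method and launch configuration changed)
#### the total number of threads in the grid (blockDim.x * gridDim.x) plays a role similar to the number of CPU cores: it is the amount by which each thread's index is advanced.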
%%cu
#include <iostream>
#include <cuda_runtime.h>
#define N 512

// Kernel: element-wise sum c[i] = a[i] + b[i] for i in [0, N), valid for any
// launch configuration with a 1D grid of 1D blocks.
//
// Each thread starts at its flat index (threadIdx.x + blockIdx.x * blockDim.x)
// and then advances by the total number of launched threads
// (blockDim.x * gridDim.x) until it runs past N. The loop is what lets N
// exceed the number of launched threads: with a plain `if`, the index update
// was dead code and each thread processed at most one element.
__global__ void add(int *a, int *b, int *c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x; // first element handled by this thread
    const int stride = blockDim.x * gridDim.x;       // total threads in the grid
    // printf("Idx is %d \n", tid);
    while (tid < N) {
        c[tid] = a[tid] + b[tid];
        tid += stride;
    }
}

// Prints a diagnostic when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        printf("CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Host entry point: sums two N-element vectors on the GPU with a fixed launch
// of 128 blocks x 128 threads, then prints the result.
// Every CUDA call is checked; after the first failure the remaining steps are
// skipped, device memory is still released, and 1 is returned (0 on success).
int main(void) {
    int a[N], b[N], c[N];
    int *dev_a = 0, *dev_b = 0, *dev_c = 0;

    // Fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // Allocate the memory on the GPU
    bool ok = cudaOk(cudaMalloc((void**)&dev_a, N * sizeof(int)), "cudaMalloc dev_a")
           && cudaOk(cudaMalloc((void**)&dev_b, N * sizeof(int)), "cudaMalloc dev_b")
           && cudaOk(cudaMalloc((void**)&dev_c, N * sizeof(int)), "cudaMalloc dev_c");

    // Copy the arrays 'a' and 'b' to the GPU
    ok = ok
      && cudaOk(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice), "copy a to device")
      && cudaOk(cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice), "copy b to device");

    if (ok) {
        // <<<number_of_blocks, number_of_threads_per_block>>>: the launch size
        // is fixed here and is not derived from N.
        add<<<128, 128>>>(dev_a, dev_b, dev_c);
        // A launch returns no status; configuration errors surface here.
        ok = cudaOk(cudaGetLastError(), "add launch");
    }

    // Copy the array 'c' back from the GPU to the CPU. This blocking copy also
    // reports any error raised while the kernel was running.
    ok = ok
      && cudaOk(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost), "copy c to host");

    // Display the results
    if (ok) {
        for (int i = 0; i < N; i++) {
            printf("%d + %d = %d\n", a[i], b[i], c[i]);
        }
    }

    // Free the memory allocated on the GPU (cudaFree on a 0 pointer is a no-op)
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return ok ? 0 : 1;
}


0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72
-10 + 100 = 90
-11 + 121 = 110
-12 + 144 = 132
-13 + 169 = 156
-14 + 196 = 182
-15 + 225 = 210
-16 + 256 = 240
-17 + 289 = 272
-18 + 324 = 306
-19 + 361 = 342
-20 + 400 = 380
-21 + 441 = 420
-22 + 484 = 462
-23 + 529 = 506
-24 + 576 = 552
-25 + 625 = 600
-26 + 676 = 650
-27 + 729 = 702
-28 + 784 = 756
-29 + 841 = 812
-30 + 900 = 870
-31 + 961 = 930
-32 + 1024 = 992
-33 + 1089 = 1056
-34 + 1156 = 1122
-35 + 1225 = 1190
-36 + 1296 = 1260
-37 + 1369 = 1332
-38 + 1444 = 1406
-39 + 1521 = 1482
-40 + 1600 = 1560
-41 + 1681 = 1640
-42 + 1764 = 1722
-43 + 1849 = 1806
-44 + 1936 = 1892
-45 + 2025 = 1980
-46 + 2116 = 2070
-47 + 2209 = 2162
-48 + 2304 = 2256
-49 + 2401 = 2352
-50 + 2500 = 2450
-51 + 2601 = 2550
-52 + 2704 = 2652
-53 + 2809 = 2756
-54 + 2916 = 2862
-55 + 3025 = 2970
-56 + 3136 = 3080
-57 + 3249 = 3192
-58 + 3364 = 3306
-59 + 3481 = 3422
-60 + 3600 = 3540
-61 + 3

In [None]:
### GPU RIPPLE USING THREADS
%%cu
#include <iostream>
#include <cuda_runtime.h>
// State shared between main and the animation callbacks (generate_frame and
// cleanup), which receive a pointer to it.
struct DataBlock {
    unsigned char *dev_bitmap; // buffer allocated with cudaMalloc in main and released with cudaFree in cleanup
    CPUAnimBitmap *bitmap;     // set in main to the address of the CPUAnimBitmap object; NOTE(review): CPUAnimBitmap is not declared in this cell
};
// Releases the GPU buffer referenced by the data block.
void cleanup( DataBlock *d ) {
    unsigned char *frame_buffer = d->dev_bitmap;
    cudaFree( frame_buffer );
}
// Forward declaration: generate_frame is defined after main but is referenced
// inside it, so it must be declared before use.
void generate_frame( DataBlock *d, int ticks );

// Host entry point: creates the animated bitmap, allocates a device buffer of
// the bitmap's image size, and hands the frame and cleanup callbacks to
// anim_and_exit. Returns 1 if the device allocation fails.
//
// NOTE(review): CPUAnimBitmap, DIM and kernel are not defined anywhere in this
// cell, so it still cannot compile on its own. Presumably they come from a
// helper header and a kernel definition that are missing here — confirm.
int main( void ) {
    DataBlock data;
    CPUAnimBitmap bitmap( DIM, DIM, &data );
    data.bitmap = &bitmap;
    data.dev_bitmap = 0;

    cudaError_t status = cudaMalloc( (void**)&data.dev_bitmap, bitmap.image_size() );
    if ( status != cudaSuccess ) {
        printf( "cudaMalloc failed: %s\n", cudaGetErrorString( status ) );
        return 1;
    }

    bitmap.anim_and_exit( (void (*) (void*, int))generate_frame, (void (*) (void*))cleanup );
    return 0;
}

// Frame callback: launches `kernel` over a 2D grid of 16x16-thread blocks
// (DIM/16 blocks in each direction) with the device buffer and the tick
// count, then copies the device buffer into the host bitmap.
void generate_frame( DataBlock *d, int ticks ) {
    const dim3 threads_per_block( 16, 16 );
    const dim3 blocks_per_grid( DIM/16, DIM/16 );

    kernel<<<blocks_per_grid, threads_per_block>>>( d->dev_bitmap, ticks );

    CPUAnimBitmap *host_bitmap = d->bitmap;
    cudaMemcpy( host_bitmap->get_ptr(), d->dev_bitmap, host_bitmap->image_size(), cudaMemcpyDeviceToHost );
}

/tmp/tmpp6lbb257/f3c4a093-224f-4ccd-a0a2-12aab01dfcd1.cu(5): error: identifier "CPUAnimBitmap" is undefined
      CPUAnimBitmap *bitmap;
      ^

/tmp/tmpp6lbb257/f3c4a093-224f-4ccd-a0a2-12aab01dfcd1.cu(13): error: identifier "CPUAnimBitmap" is undefined
      CPUAnimBitmap bitmap( DIM, DIM, &data );
      ^

/tmp/tmpp6lbb257/f3c4a093-224f-4ccd-a0a2-12aab01dfcd1.cu(13): error: identifier "DIM" is undefined
      CPUAnimBitmap bitmap( DIM, DIM, &data );
                            ^

/tmp/tmpp6lbb257/f3c4a093-224f-4ccd-a0a2-12aab01dfcd1.cu(16): error: identifier "generate_frame" is undefined
      bitmap.anim_and_exit((void (*) (void*, int))generate_frame, (void (*) (void*))cleanup);
                                                  ^

/tmp/tmpp6lbb257/f3c4a093-224f-4ccd-a0a2-12aab01dfcd1.cu(20): error: identifier "DIM" is undefined
      dim3 blocks(DIM/16,DIM/16);
                  ^

/tmp/tmpp6lbb257/f3c4a093-224f-4ccd-a0a2-12aab01dfcd1.cu(22): error: identifier "kernel" is undefined