<a href="https://colab.research.google.com/github/ayanmitra2021/CUDA_Practice/blob/master/Cuda_practice_notebook.ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [1]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpni3nrae3".


In [11]:
%%writefile test.cu

#include <iostream>
#include <cuda_runtime.h>

// Terminates the program with a diagnostic if a CUDA runtime call failed.
//   err     - status code produced by the CUDA call being checked
//   message - caller-supplied context printed ahead of the CUDA error text
// Returns normally only when err == cudaSuccess.
void checkCudaError(cudaError_t err, const char* message) {
    // Guard clause: a successful call needs no reporting.
    if (err == cudaSuccess) {
        return;
    }

    const char* reason = cudaGetErrorString(err);
    fprintf(stderr, "CUDA Error: %s - %s\n", message, reason);
    exit(EXIT_FAILURE);
}

// Kernel: every launched thread prints one greeting tagged with its
// threadIdx.x. Device-side printf is intended for debugging/demo use only.
__global__ void helloFromGPU() {
    const unsigned int lane = threadIdx.x;
    printf("Hello World from GPU thread %d!\n", lane);
}

// Host entry point: prints a CPU greeting, launches helloFromGPU on one block
// of five threads, and reports any launch or execution error before exiting.
int main() {
    std::cout << "Hello World from CPU!" << std::endl;

    const dim3 gridShape(1);
    const dim3 blockShape(5);
    helloFromGPU<<<gridShape, blockShape>>>();

    // A kernel launch returns nothing; launch-time errors are read back here.
    const cudaError_t launchStatus = cudaGetLastError();
    checkCudaError(launchStatus, "Kernel Launch Failed");

    // Wait for the kernel to finish so its output and any fault are observed.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    checkCudaError(syncStatus, "cudaDeviceSynchronize Failed");

    std::cout << "\nSuccessfully synchronized with GPU." << std::endl;
    return 0;
}

Overwriting test.cu


In [12]:
!nvcc test.cu -o test_executable -arch=sm_75

In [14]:
!./test_executable

Hello World from CPU!
Hello World from GPU thread 0!
Hello World from GPU thread 1!
Hello World from GPU thread 2!
Hello World from GPU thread 3!
Hello World from GPU thread 4!

Successfully synchronized with GPU.


In [31]:
%%writefile test01.cu

#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>


// Kernel: each thread prints its block index, its thread index within the
// block, and the index of the 32-thread group (threadIdx.x / 32) it falls in.
__global__ void test01()
{
  // Integer division groups consecutive runs of 32 thread indices together.
  const int warpIndex = threadIdx.x / 32;
  printf("\n The block ID is %d -- The thread ID is %d -- The warp ID  %d", blockIdx.x, threadIdx.x, warpIndex);
}

// Host entry point: launches test01 on 2 blocks of 64 threads, waits for it
// to finish, and returns non-zero if the launch, the kernel execution, or the
// device reset reports a CUDA error.
int main()
{
	test01<<<2, 64>>>();

	// A kernel launch has no return value; configuration errors surface here.
	cudaError_t status = cudaGetLastError();
	if (status != cudaSuccess)
	{
		fprintf(stderr, "test01 launch failed: %s\n", cudaGetErrorString(status));
		return 1;
	}

	// Explicitly wait for the kernel so execution faults are reported instead
	// of being silently discarded by the reset below.
	status = cudaDeviceSynchronize();
	if (status != cudaSuccess)
	{
		fprintf(stderr, "test01 execution failed: %s\n", cudaGetErrorString(status));
		return 1;
	}

	status = cudaDeviceReset();
	if (status != cudaSuccess)
	{
		fprintf(stderr, "cudaDeviceReset failed: %s\n", cudaGetErrorString(status));
		return 1;
	}
	return 0;
}

Overwriting test01.cu


In [32]:
!nvcc test01.cu -o test_executable01 -arch=sm_75

In [33]:
!./test_executable01


 The block ID is 0 -- The thread ID is 0 -- The warp ID  0
 The block ID is 0 -- The thread ID is 1 -- The warp ID  0
 The block ID is 0 -- The thread ID is 2 -- The warp ID  0
 The block ID is 0 -- The thread ID is 3 -- The warp ID  0
 The block ID is 0 -- The thread ID is 4 -- The warp ID  0
 The block ID is 0 -- The thread ID is 5 -- The warp ID  0
 The block ID is 0 -- The thread ID is 6 -- The warp ID  0
 The block ID is 0 -- The thread ID is 7 -- The warp ID  0
 The block ID is 0 -- The thread ID is 8 -- The warp ID  0
 The block ID is 0 -- The thread ID is 9 -- The warp ID  0
 The block ID is 0 -- The thread ID is 10 -- The warp ID  0
 The block ID is 0 -- The thread ID is 11 -- The warp ID  0
 The block ID is 0 -- The thread ID is 12 -- The warp ID  0
 The block ID is 0 -- The thread ID is 13 -- The warp ID  0
 The block ID is 0 -- The thread ID is 14 -- The warp ID  0
 The block ID is 0 -- The thread ID is 15 -- The warp ID  0
 The block ID is 0 -- The thread ID is 16 -- The 

In [39]:
%%writefile test02_vector_add.cu
//This program adds two vectors element-wise and stores the result in a third vector.
//The vector addition happens on the GPU, while initialization happens on the CPU.

#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>

#define SIZE 1024 //define the size of the vector

// Element-wise vector addition: C[i] = A[i] + B[i] for every 0 <= i < n.
//   A, B - input vectors (device pointers) holding at least n ints each
//   C    - output vector (device pointer) with room for at least n ints
//   n    - number of elements to add
// Expects a 1D launch; each thread handles at most one element, so the launch
// must supply at least n threads in total (gridDim.x * blockDim.x >= n).
__global__ void vector_add(int *A, int *B, int *C, int n)
{
  // Flat global index, so launches with more than one block address distinct
  // elements instead of every block recomputing elements 0..blockDim.x-1.
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  // Threads past the end of the vectors must not touch memory.
  if (i < n)
  {
    C[i] = A[i] + B[i];
  }
}

// Prints a diagnostic naming the failed step and terminates the process when
// a CUDA runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host entry point: fills two SIZE-element vectors on the CPU, adds them on
// the GPU with vector_add, prints every "a + b = c" line, and releases all
// host and device buffers. Returns non-zero if a host allocation fails; any
// CUDA failure terminates the process via checkCuda.
int main()
{
  int *A, *B, *C; //host vectors
  int *d_A = NULL, *d_B = NULL, *d_C = NULL; //device vectors

  const size_t size = SIZE * sizeof(int);

  //Allocate memory for host
  A = (int *)malloc(size);
  B = (int *)malloc(size);
  C = (int *)malloc(size);
  if (A == NULL || B == NULL || C == NULL)
  {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
    free(A); // free(NULL) is a no-op, so partial failures are handled too
    free(B);
    free(C);
    return EXIT_FAILURE;
  }

  //Allocate memory for device
  checkCuda(cudaMalloc((void **)&d_A, size), "cudaMalloc d_A");
  checkCuda(cudaMalloc((void **)&d_B, size), "cudaMalloc d_B");
  checkCuda(cudaMalloc((void **)&d_C, size), "cudaMalloc d_C");

  //Initialize the vectors on the CPU
  for(int i = 0; i < SIZE; i++)
  {
    A[i] = i;
    B[i] = i;
  }

  //Copy the vectors from the host to the device
  checkCuda(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice), "copy A to device");
  checkCuda(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "copy B to device");

  //Launch one block of SIZE threads, one thread per element. SIZE therefore
  //has to stay within the device's per-block thread limit; a violation is
  //reported by the cudaGetLastError() check below.
  vector_add<<<1, SIZE>>>(d_A, d_B, d_C, SIZE);
  checkCuda(cudaGetLastError(), "vector_add launch");
  checkCuda(cudaDeviceSynchronize(), "vector_add execution");

  //Copy the result from the device to the host
  checkCuda(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");

  //print the values of the resultant vector
  for(int i = 0; i < SIZE; i++)
  {
    printf("%d + %d = %d\n", A[i], B[i], C[i]);
  }

  //Release device and host buffers
  checkCuda(cudaFree(d_A), "cudaFree d_A");
  checkCuda(cudaFree(d_B), "cudaFree d_B");
  checkCuda(cudaFree(d_C), "cudaFree d_C");
  free(A);
  free(B);
  free(C);

  return 0;
}

Overwriting test02_vector_add.cu


In [40]:
!nvcc test02_vector_add.cu -o test02_vector_add -arch=sm_75

In [41]:
!./test02_vector_add

0 + 0 = 0
1 + 1 = 2
2 + 2 = 4
3 + 3 = 6
4 + 4 = 8
5 + 5 = 10
6 + 6 = 12
7 + 7 = 14
8 + 8 = 16
9 + 9 = 18
10 + 10 = 20
11 + 11 = 22
12 + 12 = 24
13 + 13 = 26
14 + 14 = 28
15 + 15 = 30
16 + 16 = 32
17 + 17 = 34
18 + 18 = 36
19 + 19 = 38
20 + 20 = 40
21 + 21 = 42
22 + 22 = 44
23 + 23 = 46
24 + 24 = 48
25 + 25 = 50
26 + 26 = 52
27 + 27 = 54
28 + 28 = 56
29 + 29 = 58
30 + 30 = 60
31 + 31 = 62
32 + 32 = 64
33 + 33 = 66
34 + 34 = 68
35 + 35 = 70
36 + 36 = 72
37 + 37 = 74
38 + 38 = 76
39 + 39 = 78
40 + 40 = 80
41 + 41 = 82
42 + 42 = 84
43 + 43 = 86
44 + 44 = 88
45 + 45 = 90
46 + 46 = 92
47 + 47 = 94
48 + 48 = 96
49 + 49 = 98
50 + 50 = 100
51 + 51 = 102
52 + 52 = 104
53 + 53 = 106
54 + 54 = 108
55 + 55 = 110
56 + 56 = 112
57 + 57 = 114
58 + 58 = 116
59 + 59 = 118
60 + 60 = 120
61 + 61 = 122
62 + 62 = 124
63 + 63 = 126
64 + 64 = 128
65 + 65 = 130
66 + 66 = 132
67 + 67 = 134
68 + 68 = 136
69 + 69 = 138
70 + 70 = 140
71 + 71 = 142
72 + 72 = 144
73 + 73 = 146
74 + 74 = 148
75 + 75 = 150
76 + 76 = 1