In [4]:
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <stdlib.h>
/*
 * Lists every CUDA device the runtime reports and prints a summary of its
 * cudaDeviceProp fields (compute capability, memory sizes, launch limits,
 * clock rate) to stdout.
 *
 * Returns EXIT_SUCCESS when the enumeration completes (including the case
 * where no device is present) and EXIT_FAILURE when a CUDA runtime call
 * reports an error; the error text is written to stderr.
 */
int main()
{
    /* Initialized so the value is well-defined even if the query fails. */
    int deviceCount = 0;
    cudaError_t status = cudaGetDeviceCount(&deviceCount);

    /* Treat an explicit "no device" status the same as a count of zero. */
    if (status == cudaErrorNoDevice)
    {
        deviceCount = 0;
    }
    else if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return EXIT_FAILURE;
    }

    if (deviceCount == 0)
    {
        printf("There is no device supporting CUDA\n");
        return EXIT_SUCCESS;
    }

    for (int dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        status = cudaGetDeviceProperties(&deviceProp, dev);
        if (status != cudaSuccess)
        {
            /* Without valid properties every field below would be garbage. */
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d: %s\n",
                    dev, cudaGetErrorString(status));
            return EXIT_FAILURE;
        }

        /* The device-count banner is printed once, before the first device. */
        if (dev == 0)
        {
            /* NOTE(review): the major < 1 test looks like a legacy guard for a
             * placeholder device entry - confirm it is still needed. */
            if (deviceProp.major < 1)
            {
                printf("There is no device supporting CUDA.\n");
            }
            else if (deviceCount == 1)
            {
                printf("There is 1 device supporting CUDA\n");
            }
            else
            {
                printf("There are %d devices supporting CUDA\n", deviceCount);
            }
        }

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        printf("  Major revision number:                         %d\n", deviceProp.major);
        printf("  Minor revision number:                         %d\n", deviceProp.minor);
        /* The memory-size fields are size_t, hence %zu. */
        printf("  Total amount of global memory:                 %zu bytes\n", deviceProp.totalGlobalMem);
        printf("  Total amount of constant memory:               %zu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Multiprocessor count:                          %d\n", deviceProp.multiProcessorCount);

        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
               deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %zu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %zu bytes\n", deviceProp.textureAlignment);
        printf("  Clock rate:                                    %d kilohertz\n", deviceProp.clockRate);
    }

    return EXIT_SUCCESS;
}


Overwriting cuda_device_info.cu


In [5]:
!nvcc cuda_device_info.cu -o cuda_device_info

In [6]:
!./cuda_device_info

There is 1 device supporting CUDA

Device 0: "Tesla T4"
  Major revision number:                         7
  Minor revision number:                         5
  Total amount of global memory:                 15835660288 bytes
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Multiprocessor count:                          40
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     2147483647 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Clock rate:                                    1590000 kilohertz


In [7]:
%%writefile helloFromThread.cu
#include <iostream>
#include <cuda_runtime.h>

// Kernel: each thread prints one greeting tagged with its x-index inside
// the block (threadIdx.x). Uses device-side printf, so output appears on
// the host only after the host synchronizes with the device.
__global__ void helloFromThreads() {
    printf("Hello World from Thread ID: %d\n", static_cast<int>(threadIdx.x));
}

// Launches helloFromThreads on a single block of 10 threads and waits for
// the device to finish so the kernel's printf output is flushed.
// Returns 0 on success, 1 if the launch or the kernel execution failed
// (the CUDA error string is written to std::cerr).
int main() {
    const int numThreads = 10;  // Number of threads per block

    // Launch the kernel with 1 block and numThreads threads
    helloFromThreads<<<1, numThreads>>>();

    // A kernel launch returns no status itself; configuration errors are
    // only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    // Wait for all threads to finish; errors raised while the kernel was
    // running are reported by this call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        std::cerr << "Kernel execution failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    return 0;
}

Writing helloFromThread.cu


In [8]:
!nvcc helloFromThread.cu -o helloFromThread

In [10]:
!./helloFromThread

Hello World from Thread ID: 0
Hello World from Thread ID: 1
Hello World from Thread ID: 2
Hello World from Thread ID: 3
Hello World from Thread ID: 4
Hello World from Thread ID: 5
Hello World from Thread ID: 6
Hello World from Thread ID: 7
Hello World from Thread ID: 8
Hello World from Thread ID: 9


In [11]:
%%writefile global.cu
#include <iostream>
#include <cuda_runtime.h>

// Kernel: each thread prints one greeting tagged with its index across the
// whole 1D launch, computed from the block index, block size and the
// thread's index inside its block (x dimension only).
__global__ void helloFromThreads() {
    // Index of the first thread belonging to this block.
    const unsigned int blockStart = blockIdx.x * blockDim.x;
    const int id = static_cast<int>(blockStart + threadIdx.x);
    printf("Hello World from Global Thread ID: %d\n", id);
}

// Launches helloFromThreads on 2 blocks of 5 threads each and waits for the
// device to finish so the kernel's printf output is flushed.
// Returns 0 on success, 1 if the launch or the kernel execution failed
// (the CUDA error string is written to std::cerr).
int main() {
    const int numBlocks = 2;   // Number of blocks
    const int numThreads = 5;  // Number of threads per block

    // Launch the kernel with numBlocks and numThreads per block
    helloFromThreads<<<numBlocks, numThreads>>>();

    // A kernel launch returns no status itself; configuration errors are
    // only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    // Wait for all threads to finish; errors raised while the kernel was
    // running are reported by this call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        std::cerr << "Kernel execution failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    return 0;
}


Writing global.cu


In [12]:
!nvcc global.cu -o global

In [13]:
!./global

Hello World from Global Thread ID: 0
Hello World from Global Thread ID: 1
Hello World from Global Thread ID: 2
Hello World from Global Thread ID: 3
Hello World from Global Thread ID: 4
Hello World from Global Thread ID: 5
Hello World from Global Thread ID: 6
Hello World from Global Thread ID: 7
Hello World from Global Thread ID: 8
Hello World from Global Thread ID: 9


In [14]:
%%writefile 2dthread.cu
#include <iostream>
#include <cuda_runtime.h>

// Kernel: each thread prints one greeting showing the (x, y) index of its
// block within the grid and its own (x, y) index within that block.
__global__ void helloFrom2DThreads() {
    printf("Hello World from Block(%d, %d) Thread(%d, %d)\n",
           static_cast<int>(blockIdx.x), static_cast<int>(blockIdx.y),
           static_cast<int>(threadIdx.x), static_cast<int>(threadIdx.y));
}

// Launches helloFrom2DThreads on a 2x2 grid of 3x3-thread blocks and waits
// for the device to finish so the kernel's printf output is flushed.
// Returns 0 on success, 1 if the launch or the kernel execution failed
// (the CUDA error string is written to std::cerr).
int main() {
    // Define 2D block and thread dimensions
    const dim3 numBlocks(2, 2);   // 2x2 blocks
    const dim3 numThreads(3, 3);  // 3x3 threads per block

    // Launch the kernel with 2D blocks and 2D threads
    helloFrom2DThreads<<<numBlocks, numThreads>>>();

    // A kernel launch returns no status itself; configuration errors are
    // only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    // Wait for all threads to finish; errors raised while the kernel was
    // running are reported by this call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        std::cerr << "Kernel execution failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    return 0;
}

Writing 2dthread.cu


In [15]:
!nvcc 2dthread.cu -o 2dthread

In [16]:
!./2dthread

Hello World from Block(0, 1) Thread(0, 0)
Hello World from Block(0, 1) Thread(1, 0)
Hello World from Block(0, 1) Thread(2, 0)
Hello World from Block(0, 1) Thread(0, 1)
Hello World from Block(0, 1) Thread(1, 1)
Hello World from Block(0, 1) Thread(2, 1)
Hello World from Block(0, 1) Thread(0, 2)
Hello World from Block(0, 1) Thread(1, 2)
Hello World from Block(0, 1) Thread(2, 2)
Hello World from Block(0, 0) Thread(0, 0)
Hello World from Block(0, 0) Thread(1, 0)
Hello World from Block(0, 0) Thread(2, 0)
Hello World from Block(0, 0) Thread(0, 1)
Hello World from Block(0, 0) Thread(1, 1)
Hello World from Block(0, 0) Thread(2, 1)
Hello World from Block(0, 0) Thread(0, 2)
Hello World from Block(0, 0) Thread(1, 2)
Hello World from Block(0, 0) Thread(2, 2)
Hello World from Block(1, 1) Thread(0, 0)
Hello World from Block(1, 1) Thread(1, 0)
Hello World from Block(1, 1) Thread(2, 0)
Hello World from Block(1, 1) Thread(0, 1)
Hello World from Block(1, 1) Thread(1, 1)
Hello World from Block(1, 1) Threa