In [None]:
# Uncomment the line below to install PyCUDA; building it from source might take a while.
#!pip install pycuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycuda
  Downloading pycuda-2022.2.2.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytools>=2011.2
  Downloading pytools-2022.1.14.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (pyproject.

Check whether a CUDA device is available by calling **drv.Device.count()**:

In [None]:
import pycuda
import pycuda.driver as drv
# The CUDA driver API must be initialised before any device query.
drv.init()

device_count = drv.Device.count()
print(f'Detected {device_count} CUDA Capable device(s)')


Detected 1 CUDA Capable device(s)


Print out some of the properties of the device.

In [None]:
# Pick the first GPU (index 0) and report its name.
i = 0
gpu_device = drv.Device(i)

device_name = gpu_device.name()
print(f'Device {i}: {device_name}')



Device 0: Tesla T4


In [None]:
# Print out the compute capability of the device.
# It determines how many GPU cores each multiprocessor has;
# see the lookup table in a later cell for the details.

# compute_capability() yields a (major, minor) pair; join it as "major.minor".
major, minor = gpu_device.compute_capability()
compute_capability = float(f'{major}.{minor}')

print(f'\t Compute Capability: {compute_capability}')


	 Compute Capability: 7.5


In [None]:
# Total device memory, floor-divided by 1024**2 to report whole megabytes.
total_memory_mb = gpu_device.total_memory() // (1024**2)
print(f'\t Total Memory: {total_memory_mb} megabytes')

	 Total Memory: 15109 megabytes


In [None]:
# Gather all the other device attributes into a dictionary keyed by the
# attribute's string name, so that values can be looked up with a plain string.
device_attributes_tuples = tuple(gpu_device.get_attributes().items())
device_attributes = {str(attr): value for attr, value in device_attributes_tuples}

# Display the resulting dictionary as the cell output.
device_attributes

{'ASYNC_ENGINE_COUNT': 3,
 'CAN_MAP_HOST_MEMORY': 1,
 'CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM': 1,
 'CLOCK_RATE': 1590000,
 'COMPUTE_CAPABILITY_MAJOR': 7,
 'COMPUTE_CAPABILITY_MINOR': 5,
 'COMPUTE_MODE': pycuda._driver.compute_mode.DEFAULT,
 'COMPUTE_PREEMPTION_SUPPORTED': 1,
 'CONCURRENT_KERNELS': 1,
 'CONCURRENT_MANAGED_ACCESS': 1,
 'DIRECT_MANAGED_MEM_ACCESS_FROM_HOST': 0,
 'ECC_ENABLED': 1,
 'GENERIC_COMPRESSION_SUPPORTED': 0,
 'GLOBAL_L1_CACHE_SUPPORTED': 1,
 'GLOBAL_MEMORY_BUS_WIDTH': 256,
 'GPU_OVERLAP': 1,
 'HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED': 1,
 'HANDLE_TYPE_WIN32_HANDLE_SUPPORTED': 0,
 'HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED': 0,
 'HOST_NATIVE_ATOMIC_SUPPORTED': 0,
 'INTEGRATED': 0,
 'KERNEL_EXEC_TIMEOUT': 0,
 'L2_CACHE_SIZE': 4194304,
 'LOCAL_L1_CACHE_SUPPORTED': 1,
 'MANAGED_MEMORY': 1,
 'MAXIMUM_SURFACE1D_LAYERED_LAYERS': 2048,
 'MAXIMUM_SURFACE1D_LAYERED_WIDTH': 32768,
 'MAXIMUM_SURFACE1D_WIDTH': 32768,
 'MAXIMUM_SURFACE2D_HEIGHT': 65536,
 'MAXIMUM_SURFACE2D_L

In [None]:
# Cores per multiprocessor is not reported by the GPU!
# We must use a lookup table based on compute capability.
# See the following:
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities

print(f'compute_capability: {compute_capability}')

# FP32 CUDA cores per multiprocessor, keyed by compute capability.
# Keys are floats because `compute_capability` is built as a float earlier on.
cuda_cores_per_mp_table = {
    2.0: 32, 2.1: 48,                          # Fermi
    3.0: 192, 3.2: 192, 3.5: 192, 3.7: 192,    # Kepler (192 per SMX, not 32)
    5.0: 128, 5.1: 128, 5.2: 128, 5.3: 128,    # Maxwell (5.1 kept for backward compatibility)
    6.0: 64, 6.1: 128, 6.2: 128,               # Pascal
    7.0: 64, 7.2: 64,                          # Volta
    7.5: 64,                                   # Turing
    8.0: 64, 8.6: 128, 8.7: 128,               # Ampere
    8.9: 128,                                  # Ada Lovelace
    9.0: 128,                                  # Hopper
}

# Use .get() so that an architecture missing from the table is reported
# clearly instead of aborting the cell with a bare KeyError.
cuda_cores_per_mp = cuda_cores_per_mp_table.get(compute_capability)
if cuda_cores_per_mp is None:
    print(f'CUDA Cores / Multiprocessor: unknown for compute capability {compute_capability}')
else:
    print(f'CUDA Cores / Multiprocessor: {cuda_cores_per_mp}')

num_mp = device_attributes['MULTIPROCESSOR_COUNT']
print(f'number of multiprocessors: {num_mp}')

# The total core count can only be derived when the per-multiprocessor count is known.
if cuda_cores_per_mp is not None:
    print(f'CUDA Cores: {num_mp*cuda_cores_per_mp} ')
    


compute_capability: 7.5
CUDA Cores / Multiprocessor: 64
number of multiprocessors: 40
CUDA Cores: 2560 


Let us print all of these properties for every GPU:

In [None]:
for i in range(drv.Device.count()):

    gpu_device = drv.Device(i)
    print(f'Device {i}: {gpu_device.name()}')

    # compute_capability() yields a (major, minor) pair; join it as "major.minor".
    major, minor = gpu_device.compute_capability()
    compute_capability = float(f'{major}.{minor}')
    print(f'\t Compute Capability: {compute_capability}')

    # Floor-divide by 1024**2 to report whole megabytes.
    total_memory_mb = gpu_device.total_memory() // (1024**2)
    print(f'\t Total Memory: {total_memory_mb} megabytes')

    # Collect all remaining device attributes, as seen in the original
    # deviceQuery, into a dictionary keyed by the attribute's string name
    # so that values can easily be indexed with a string descriptor.
    device_attributes_tuples = gpu_device.get_attributes().items()
    device_attributes = {str(attr): value for attr, value in device_attributes_tuples}

    for name, value in device_attributes.items():
        print(f'\t {name}: {value}')

Device 0: Tesla T4
	 Compute Capability: 7.5
	 Total Memory: 15109 megabytes
	 ASYNC_ENGINE_COUNT: 3
	 CAN_MAP_HOST_MEMORY: 1
	 CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: 1
	 CLOCK_RATE: 1590000
	 COMPUTE_CAPABILITY_MAJOR: 7
	 COMPUTE_CAPABILITY_MINOR: 5
	 COMPUTE_MODE: DEFAULT
	 COMPUTE_PREEMPTION_SUPPORTED: 1
	 CONCURRENT_KERNELS: 1
	 CONCURRENT_MANAGED_ACCESS: 1
	 DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: 0
	 ECC_ENABLED: 1
	 GENERIC_COMPRESSION_SUPPORTED: 0
	 GLOBAL_L1_CACHE_SUPPORTED: 1
	 GLOBAL_MEMORY_BUS_WIDTH: 256
	 GPU_OVERLAP: 1
	 HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: 1
	 HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: 0
	 HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: 0
	 HOST_NATIVE_ATOMIC_SUPPORTED: 0
	 INTEGRATED: 0
	 KERNEL_EXEC_TIMEOUT: 0
	 L2_CACHE_SIZE: 4194304
	 LOCAL_L1_CACHE_SUPPORTED: 1
	 MANAGED_MEMORY: 1
	 MAXIMUM_SURFACE1D_LAYERED_LAYERS: 2048
	 MAXIMUM_SURFACE1D_LAYERED_WIDTH: 32768
	 MAXIMUM_SURFACE1D_WIDTH: 32768
	 MAXIMUM_SURFACE2D_HEIGHT: 65536
	 MAXIMUM_SURFACE2D_LAYERED_HE