In [11]:
import torch
import time

In [13]:
# Check if CUDA is available and get the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [20]:
# Get GPU properties
gpu_properties = torch.cuda.get_device_properties(device)
gpu_name = gpu_properties.name
sm_count = gpu_properties.multi_processor_count
print('gpu_properties: ',gpu_properties)
print('gpu_name: ',gpu_name)
print('sm_count: ',sm_count)
# Estimate the number of CUDA cores per SM based on the architecture
cores_per_sm = {
    "sm_70": 64,  # V100
    "sm_75": 64,  # T4, 2080Ti
    "sm_80": 64,  # A100
    "sm_86": 128,  # 3090, 3080, A40, A30
    "sm_87": 128,  # A10, A16
    "sm_89": 128,  # RTX 3060
    # Add more architectures if needed
}
arch = f"sm_{gpu_properties.major}{gpu_properties.minor}"
num_cores = sm_count * cores_per_sm.get(arch, 64)  # Default to 64 if architecture is unknown
print(f"Using GPU: {gpu_name}")
print(f"Number of streaming multiprocessors: {sm_count}")
print(f"Number of CUDA cores: {num_cores}")

gpu_properties:  _CudaDeviceProperties(name='NVIDIA GeForce RTX 3060', major=8, minor=6, total_memory=11938MB, multi_processor_count=28)
gpu_name:  NVIDIA GeForce RTX 3060
sm_count:  28
Using GPU: NVIDIA GeForce RTX 3060
Number of streaming multiprocessors: 28
Number of CUDA cores: 3584


In [21]:
# Define a large matrix size
matrix_size = 10000

# Create two large random matrices on the GPU
a = torch.randn(matrix_size, matrix_size, device=device)
b = torch.randn(matrix_size, matrix_size, device=device)

# Start the timer
start_time = time.time()

# Perform matrix multiplication
result = torch.matmul(a, b)

# Synchronize and stop the timer
torch.cuda.synchronize()
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Time taken for matrix multiplication: {elapsed_time} seconds")

# Check if the GPU is being utilized
gpu_utilization = torch.cuda.memory_allocated(device) / torch.cuda.get_device_properties(device).total_memory * 100
print(f"GPU utilization: {gpu_utilization}%")

Time taken for matrix multiplication: 0.23913240432739258 seconds
GPU utilization: 9.599606102885742%


In [22]:
# Check if CUDA is available and get the device
device = torch.device("cpu")
# Create two large random matrices on the CPU
a = torch.randn(matrix_size, matrix_size, device=device)
b = torch.randn(matrix_size, matrix_size, device=device)
# Start the timer
start_time = time.time()

# Perform matrix multiplication
result = torch.matmul(a, b)

# Stop the timer
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Time taken for matrix multiplication on CPU: {elapsed_time} seconds")

# Check CPU utilization (this is just an approximation)
import psutil

cpu_usage = psutil.cpu_percent(interval=1)
print(f"CPU utilization during the computation: {cpu_usage}%")

Time taken for matrix multiplication on CPU: 2.0462284088134766 seconds
CPU utilization during the computation: 2.1%
