## Checking the device

Here we check whether CUDA is actually available on this machine.
This checks both hardware and software support. \
PyTorch can also be installed without CUDA.

In [21]:
# PyTorch supplies the tensor type and the CUDA query used below.
import torch

# Choose where the computations will run. PyTorch can be installed without
# CUDA support, so fall back to the CPU whenever CUDA is not reported as
# available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("using", device, "device")

using cuda device


## Performance comparison

This is more or less a replica of the code we used in the C++ example. It uses matrix multiplication instead of addition because PyTorch's implementation of matrix addition is much faster than the one we wrote. As you can see, the PyTorch version is also much easier to use.

In [None]:
# import our timer
import time

# Define our workload. You can play around with the matrix size to change the execution time.
matrix_size = 32 * 512


def wait_for_device():
    """Block until all queued CUDA work has finished.

    CUDA kernels are launched asynchronously, so the timer must not be
    stopped before the device is done. When the selected device is the CPU
    there is nothing to wait for, and calling torch.cuda.synchronize()
    without CUDA would raise, so the call is skipped in that case.
    """
    if device.type == "cuda":
        torch.cuda.synchronize()


# Allocate matrices on the host.
# time.perf_counter() is used for all measurements because it is a
# monotonic, high-resolution clock meant for timing intervals.
print("Creating random matrices: ", end="")
start = time.perf_counter()

a = torch.randn(matrix_size, matrix_size)
b = torch.randn(matrix_size, matrix_size)

print(str(time.perf_counter() - start) + " seconds")

# Measure the execution speed on the host.
print("Host speed: ", end="")
start = time.perf_counter()

c = torch.matmul(a, b)

print(str(time.perf_counter() - start) + " seconds (" + str(c.device) + ")")

# Copy the actual data to device.
# If no CUDA device was found, `device` is the CPU and the "device"
# measurements below simply run on the host again.
print("Device Speed with overhead: ", end="")
start = time.perf_counter()

a_gpu = a.to(device)
b_gpu = b.to(device)

# The execution speed on the device with memory copying overhead.
# Note that pytorch automatically uses the cuda device when the
# used variables have been copied to the device.
c_gpu = torch.matmul(a_gpu, b_gpu)
wait_for_device()

print(str(time.perf_counter() - start) + " seconds (" + str(c_gpu.device) + ")")

# The raw execution speed on the device (inputs are already on the device).
print("Device Speed: ", end="")
start = time.perf_counter()

torch.matmul(a_gpu, b_gpu)
wait_for_device()

print(str(time.perf_counter() - start) + " seconds (" + str(c_gpu.device) + ")")

# Done 🚀

Creating random matrices: 2.687039375305176 seconds
Host speed: 17.814164876937866 seconds (cpu)
Device Speed with overhead: 0.887509822845459 seconds (cuda:0)
Device Speed: 0.5372822284698486 seconds (cuda:0)
