# Function definitions

In [None]:
def matrix_multiply(A, B):
    """Multiply two matrices given as nested sequences, in pure Python.

    Args:
        A: Left operand, a sequence of ``n`` rows with ``m`` numbers each.
        B: Right operand, a sequence of ``m`` rows with ``p`` numbers each.

    Returns:
        A new ``n`` x ``p`` list of lists holding the matrix product.

    Raises:
        ValueError: If the rows of ``B`` have differing lengths, or a row of
            ``A`` does not have exactly ``len(B)`` entries.
    """
    m = len(B)
    # An empty B has no columns; do not index B[0] in that case.
    p = len(B[0]) if m else 0

    # Validate shapes up front: without this, extra columns in A would be
    # silently ignored and the returned product would simply be wrong.
    for b_row in B:
        if len(b_row) != p:
            raise ValueError("all rows of B must have the same length")
    for a_row in A:
        if len(a_row) != m:
            raise ValueError(
                "each row of A must have len(B) entries "
                f"(expected {m}, got {len(a_row)})"
            )

    result = []
    for a_row in A:
        out_row = []
        for j in range(p):
            # Accumulate in a local instead of re-indexing result[i][j].
            acc = 0
            for k in range(m):
                acc += a_row[k] * B[k][j]
            out_row.append(acc)
        result.append(out_row)
    return result


In [None]:
def matrix_multiply_np(A, B, Reps):
    """Chain matrix products with NumPy.

    Starts from ``np.dot(A, B)`` and then left-multiplies the running
    product by ``A`` once per iteration of ``range(Reps)``.

    Args:
        A: Left factor applied at every step.
        B: Right factor of the initial product.
        Reps: Number of additional left-multiplications by ``A``.

    Returns:
        The final product as returned by ``np.dot``.
    """
    product = np.dot(A, B)
    for _ in range(Reps):
        product = np.dot(A, product)
    return product


In [None]:
def matrix_multiply_torch(A, B, Reps):
    """Chain matrix products with PyTorch, on the GPU when one is available.

    Computes ``A @ B`` once and then left-multiplies the running product by
    ``A`` ``Reps`` more times, mirroring ``matrix_multiply_np``.

    Args:
        A: Left factor (array-like); converted to a float64 tensor.
        B: Right factor of the initial product (array-like); converted to a
            float64 tensor.
        Reps: Number of additional left-multiplications by ``A``.

    Returns:
        The final product as a NumPy array on the host.
    """
    # Use CUDA when available, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # as_tensor avoids the extra host-side copy that torch.tensor() always
    # makes before the data is moved to the target device.
    A_torch = torch.as_tensor(A, dtype=torch.float64, device=device)
    B_torch = torch.as_tensor(B, dtype=torch.float64, device=device)

    # Initial product, then Reps further products; all stay on `device`.
    C_torch = torch.matmul(A_torch, B_torch)
    for _ in range(Reps):
        C_torch = torch.matmul(A_torch, C_torch)

    # Only the final result is moved back to the CPU and converted to NumPy.
    return C_torch.cpu().numpy()


# Benchmarking functions

In [None]:
import time
import numpy as np
import torch

# Generate random matrices
n = 512
Reps = 20
A = np.random.rand(n, n).astype(np.float64)
B = np.random.rand(n, n).astype(np.float64)

# Report which device the PyTorch run below will actually use.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("device is:", device)

# Measure performance of plain Python.
# The ndarray -> nested-list conversion is done before the clock starts so
# that only the multiplication itself is timed.
A_list = A.tolist()
B_list = B.tolist()
# perf_counter() is a monotonic, high-resolution clock; time.time() can jump
# when the system clock is adjusted.
start_time = time.perf_counter()
result_python = matrix_multiply(A_list, B_list)
print("Plain Python time:", time.perf_counter() - start_time)

# Measure performance of NumPy
start_time = time.perf_counter()
result_np = matrix_multiply_np(A, B, Reps)
print("NumPy time:", time.perf_counter() - start_time)

# Measure performance of PyTorch (runs on the device printed above)
start_time = time.perf_counter()
result_torch = matrix_multiply_torch(A, B, Reps)
print("PyTorch (GPU) time:", time.perf_counter() - start_time)



# Using timeit to properly benchmark functions

In [None]:
import matplotlib
matplotlib.use('Agg')  # Use the 'Agg' backend for file creation
import torch
import timeit
import numpy as np
import matplotlib.pyplot as plt

# Report the device PyTorch will run on.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print("device is:", device)

# Benchmark configuration: matrix size, inputs, repetition counts to sweep,
# and how many timeit runs are averaged per measurement.
n = 512
number = 10
reps_range = range(1, 61, 10)
A = np.random.rand(n, n).astype(np.float64)
B = np.random.rand(n, n).astype(np.float64)
numpy_times = []
torch_times = []

for Reps in reps_range:
    # Mean seconds per call for the NumPy chain at this Reps value.
    numpy_time = timeit.timeit(
        'matrix_multiply_np(A, B, Reps)',
        globals=globals(),
        number=number,
    ) / number
    print('numpy ', Reps, numpy_time)
    numpy_times.append(numpy_time)

    # Mean seconds per call for the PyTorch chain at this Reps value.
    torch_time = timeit.timeit(
        'matrix_multiply_torch(A, B,  Reps)',
        globals=globals(),
        number=number,
    ) / number
    print('Torch ', Reps, torch_time)
    torch_times.append(torch_time)

# Plot mean runtime against Reps for both libraries and save it as a PDF.
plt.figure(figsize=(10, 5))
plt.plot(reps_range, numpy_times, label='NumPy', marker='o')
plt.plot(reps_range, torch_times, label='PyTorch', marker='x')
plt.title('Performance Comparison: NumPy vs. PyTorch GPU')
plt.xlabel('Number of Repetitions (Reps)')
plt.ylabel('Time (seconds)')
plt.legend()
plt.grid(True)
plt.draw()
plt.savefig('reps_docker.pdf')


# Using more GPU intensive calculations

In [None]:
import numpy as np
import torch
import time
import matplotlib.pyplot as plt

def multiply_numpy(A, B):
    """Return the product of ``A`` and ``B`` computed with ``np.dot``."""
    product = np.dot(A, B)
    return product

def multiply_torch(A, B):
    """Multiply ``A`` and ``B`` with PyTorch, on the GPU when available.

    Args:
        A: Left operand (array-like); converted to a float64 tensor.
        B: Right operand (array-like); converted to a float64 tensor.

    Returns:
        The product as a NumPy array on the host.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # as_tensor avoids the extra host-side copy that torch.tensor() always
    # makes before the data is moved to the target device.
    A_torch = torch.as_tensor(A, dtype=torch.float64, device=device)
    B_torch = torch.as_tensor(B, dtype=torch.float64, device=device)
    # matmul allocates a fresh result tensor, so the inputs are never aliased
    # by the returned array.
    return torch.matmul(A_torch, B_torch).cpu().numpy()

# Sizes of the matrices to test
sizes = [256, 512, 2**10, 2**11, 2**12]
numpy_times = []
torch_times = []


for size in sizes:
    A = np.random.rand(size, size).astype(np.float64)
    B = np.random.rand(size, size).astype(np.float64)

    # Time NumPy. perf_counter() is a monotonic, high-resolution clock;
    # time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    Cnp = multiply_numpy(A, B)
    numpy_duration = time.perf_counter() - start_time
    numpy_times.append(numpy_duration)

    # Time PyTorch (GPU when available); the measurement covers the whole
    # multiply_torch call, including tensor creation and the copy back.
    start_time = time.perf_counter()
    Ctorch = multiply_torch(A, B)
    torch_duration = time.perf_counter() - start_time
    torch_times.append(torch_duration)

    print(f"Size: {size}x{size}")
    print("NumPy Duration:", numpy_duration)
    print("Torch GPU Duration:", torch_duration)
    # Largest element-wise deviation between the two results.
    print("Difference:", np.max(np.abs(Cnp - Ctorch)), '\n')


# Plotting the results
plt.figure(figsize=(10, 5))
plt.plot(sizes, numpy_times, label='NumPy', marker='o')
plt.plot(sizes, torch_times, label='PyTorch GPU', marker='x')
plt.xlabel('Matrix Size')
plt.ylabel('Time (seconds)')
plt.title('Performance Comparison: NumPy vs. PyTorch GPU')
plt.yscale('log')  # Set the y-axis to logarithmic scale
plt.legend()
plt.grid(True, which="both", ls="--")  # Enable grid for major and minor ticks
plt.draw()
plt.savefig('sizes_docker.pdf')


# Different GPU optimized functions

In [None]:
import numpy as np
import torch
from PIL import Image
import timeit

# Example image loading and conversion to numpy.
# The context manager closes the underlying file once the pixel data has
# been converted into a new RGB image.
with Image.open('Kinkade.jpg') as source_image:
    image = source_image.convert('RGB')  # Ensure image is in RGB
image_np = np.array(image).astype(np.float32)  # Convert to float for better handling in PyTorch

# Fall back to the CPU when CUDA is unavailable instead of failing on a
# hard-coded 'cuda' device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Reorder HWC -> CHW, add a leading batch axis, and move to the chosen device.
image_torch = torch.tensor(image_np.transpose(2, 0, 1)).unsqueeze(0).to(device)

# Define the target size as a tuple (width, height)
size = (512*15, 512*15)  # Define the size as a tuple

def numpy_resize(image_np, size):
    """Resize an image array via PIL and return the result as an array.

    Args:
        image_np: Image data as an array; it is cast to ``uint8`` before
            being handed to PIL.
        size: Target size passed unchanged to ``Image.resize``.

    Returns:
        The resized image as a NumPy array.
    """
    as_bytes = image_np.astype(np.uint8)
    pil_image = Image.fromarray(as_bytes)
    resized = pil_image.resize(size)
    return np.array(resized)

def torch_resize(image_torch, size):
    """Resize a batched image tensor with bilinear interpolation.

    Args:
        image_torch: Input tensor handed to ``torch.nn.functional.interpolate``.
        size: Target size as ``(width, height)``.

    Returns:
        The interpolated tensor, moved to the CPU.
    """
    # interpolate expects (height, width), so swap the two entries.
    width = size[0]
    height = size[1]
    resized = torch.nn.functional.interpolate(
        image_torch,
        size=(height, width),
        mode='bilinear',
        align_corners=False,
    )
    return resized.cpu()

# Average each resize implementation over `number` timeit runs.
number = 10  # Number of iterations for timing

# Mean seconds per call of the PIL/NumPy resize.
numpy_time = timeit.timeit(
    'numpy_resize(image_np, size)',
    globals=globals(),
    number=number,
) / number

# Mean seconds per call of the PyTorch resize.
torch_time = timeit.timeit(
    'torch_resize(image_torch, size)',
    globals=globals(),
    number=number,
) / number

print("NumPy Duration:", numpy_time)
print("Torch GPU Duration:", torch_time)


# Functions not optimized for GPU

In [None]:
# Creating a large array and summing it
import timeit
import numpy as np
import torch
large_data = np.random.rand(10**8)  # 10**8 = 100 million elements


def sum_numpy(data):
    """Return the sum of all elements of ``data`` computed with ``np.sum``."""
    total = np.sum(data)
    return total

def sum_torch(data):
    """Sum all elements of ``data`` with PyTorch, on the GPU when available.

    Args:
        data: Array-like input; converted to a tensor on the chosen device.

    Returns:
        A zero-dimensional CPU tensor holding the sum.
    """
    # Fall back to the CPU when CUDA is unavailable instead of failing on an
    # unconditional .cuda() call.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # as_tensor avoids the extra host-side copy that torch.tensor() always
    # makes before the data is moved to the target device.
    tensor = torch.as_tensor(data, device=device)
    return torch.sum(tensor).cpu()

# Average each sum implementation over `number` timeit runs.
number = 10

# Mean seconds per call of the NumPy sum.
numpy_time = timeit.timeit(
    'sum_numpy(large_data)',
    globals=globals(),
    number=number,
) / number

# Mean seconds per call of the PyTorch sum.
torch_time = timeit.timeit(
    'sum_torch(large_data)',
    globals=globals(),
    number=number,
) / number

print("NumPy time:", numpy_time)
print("Torch time:", torch_time)
