In [3]:
import torch
import time

# Check for GPU: use CUDA when PyTorch can see a device, otherwise fall back
# to the CPU so the cell still runs (just far more slowly).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed PyTorch's global random number generator before anything is drawn, so
# the operands below -- and the noise the loop later draws from the same
# generator -- are repeatable from one run to the next. Results may still
# differ between devices or PyTorch versions.
seed = 0
torch.manual_seed(seed)

# Matrix size and loop count
size = 3000  # each operand is a size x size matrix; large enough for heavy GPU usage
iterations = 200  # adjust if it finishes too quickly or too slowly

# Initialize random matrices directly on the selected device (no host copy).
A = torch.randn(size, size, device=device)
B = torch.randn(size, size, device=device)

# CUDA kernels are launched asynchronously: the Python call returns as soon as
# the work is queued. Drain any pending work (e.g. the operand initialisation)
# before reading the clock so the measurement covers only the loop below.
if device.type == "cuda":
    torch.cuda.synchronize(device)
start_time = time.perf_counter()  # monotonic clock, unaffected by wall-clock adjustments

for i in range(iterations):
    # Step 1: Matrix multiplication
    C = torch.matmul(A, B)

    # Step 2: Element-wise nonlinear operations
    # (the 1e-6 offset keeps the sqrt argument strictly positive)
    C = torch.sin(C) * torch.sqrt(torch.abs(C) + 1e-6)

    # Step 3: Add some noise and reduce each row to a single value
    C = C + torch.randn(size, size, device=device) * 0.01
    row_sum = C.sum(dim=1)

    # Step 4: Normalize the row sums (1e-6 guards against a zero std)
    A = (row_sum - row_sum.mean()) / (row_sum.std() + 1e-6)
    # Reshape A for next iteration: broadcast the length-`size` vector to a
    # (size, size) view, so every column of the new A holds the same values.
    A = A.unsqueeze(1).expand(-1, size)

    if i % 20 == 0:
        # On CUDA this marks the iteration as queued, not necessarily finished.
        print(f"Iteration {i} done")

# Wait for all queued GPU work to finish before stopping the clock; without
# this the reported time can exclude kernels that are still running.
if device.type == "cuda":
    torch.cuda.synchronize(device)
end_time = time.perf_counter()
print(f"Finished {iterations} iterations in {end_time - start_time:.2f} seconds")
print("Result snippet:", A[:5, :5])


Using device: cuda
Iteration 0 done
Iteration 20 done
Iteration 40 done
Iteration 60 done
Iteration 80 done
Iteration 100 done
Iteration 120 done
Iteration 140 done
Iteration 160 done
Iteration 180 done
Finished 200 iterations in 7.72 seconds
Result snippet: tensor([[ 0.1138,  0.1138,  0.1138,  0.1138,  0.1138],
        [-0.9175, -0.9175, -0.9175, -0.9175, -0.9175],
        [-0.4750, -0.4750, -0.4750, -0.4750, -0.4750],
        [-0.7620, -0.7620, -0.7620, -0.7620, -0.7620],
        [ 0.1296,  0.1296,  0.1296,  0.1296,  0.1296]], device='cuda:0')
