In [1]:
import numpy as np
import mlx.core as mx
# import mlx.core
import platform
import psutil
import time
import os

In [2]:
def test_matrix_size(size, use_gpu=True):
    """
    Tests if a given matrix size works with MLX's QR implementation.
    
    This function creates a matrix of the specified size and attempts a QR
    factorization. It helps us find the largest matrix size that works reliably
    on the current hardware.
    
    Parameters:
        size: The size of the square matrix to test
        use_gpu: Whether to use GPU acceleration
    
    Returns:
        bool: True if the operation succeeds, False if it fails
    """
    try:
        # Create a test matrix
        A = np.random.randn(size, size).astype(np.float32)
        A = mx.array(A)
        
        # Attempt QR factorization
        Q, R = mx.linalg.qr(A)
        _ = mx.eval((Q, R))
        
        return True
    except Exception as e:
        print(f"Failed at size {size}x{size}: {str(e)}")
        return False

In [3]:
def find_safe_matrix_size(max_size=10000, step=500, use_gpu=True):
    """
    Finds the largest matrix size that works reliably with MLX QR.
    
    Uses binary search to efficiently find the maximum working size.
    This helps us set appropriate benchmarking parameters for the
    current hardware.
    
    Parameters:
        max_size: Maximum size to try
        step: Size increment for initial testing
        use_gpu: Whether to use GPU acceleration
    
    Returns:
        int: The largest working matrix size found
    """
    if use_gpu:
        mx.set_default_device(mx.gpu)
    else:
        mx.set_default_device(mx.cpu)
    
    print("Finding maximum safe matrix size...")
    
    # Binary search for the maximum working size
    left, right = step, max_size
    safe_size = step
    
    while left <= right:
        mid = (left + right) // 2
        mid = (mid // step) * step  # Round to nearest step
        
        print(f"Testing size {mid}x{mid}...")
        if test_matrix_size(mid, use_gpu):
            safe_size = mid
            left = mid + step
        else:
            right = mid - step
    
    print(f"Largest safe matrix size found: {safe_size}x{safe_size}")
    return safe_size

In [4]:
def benchmark_qr_operations(matrix_size=None, iterations=5, use_gpu=False):
    """
    Benchmark QR factorization using either NumPy or MLX.
    
    If no matrix_size is provided, it automatically finds a safe size.
    The benchmark includes careful error handling and fallback options.
    """
    xp, linalg, backend_name = (mx, mx.linalg, "MLX with Metal GPU") if use_gpu else (np, np.linalg, "NumPy")
    
    if use_gpu:
        mx.set_default_device(mx.gpu)
        if matrix_size is None:
            matrix_size = find_safe_matrix_size(use_gpu=True)
    else:
        if matrix_size is None:
            matrix_size = 5000  # Default size for CPU
    
    print(f"\nSystem Information:")
    print(f"OS: {platform.system()} {platform.version()}")
    print(f"CPU: {platform.processor()}")
    print(f"Memory: {psutil.virtual_memory().total / (1024**3):.1f} GB")
    print(f"Backend: {backend_name}")
    print(f"Matrix size: {matrix_size}x{matrix_size}")
    print(f"Iterations: {iterations}")
    
    times = []
    
    try:
        # Warmup iteration
        A_warmup = np.random.randn(matrix_size, matrix_size).astype(np.float32)
        if use_gpu:
            A_warmup = mx.array(A_warmup)
            Q, R = mx.linalg.qr(A_warmup)
            _ = mx.eval((Q, R))
        else:
            _ = np.linalg.qr(A_warmup)
        
        for i in range(iterations):
            A = np.random.randn(matrix_size, matrix_size).astype(np.float32)
            
            start_time = time.perf_counter()
            
            if use_gpu:
                A = mx.array(A)
                Q, R = mx.linalg.qr(A)
                Q, R = mx.eval((Q, R))
            else:
                Q, R = np.linalg.qr(A)
            
            end_time = time.perf_counter()
            
            operation_time = end_time - start_time
            times.append(operation_time)
            
            print(f"Iteration {i+1}/{iterations}: {operation_time:.2f} seconds")
            
    except Exception as e:
        print(f"Error during benchmark: {str(e)}")
        if use_gpu:
            print("Consider using a smaller matrix size or running on CPU")
        return None, None
    
    return sum(times) / len(times), times

In [None]:
def run_full_benchmark():
    """
    Run benchmarks with both CPU and GPU backends with automatic size selection.
    """
    print("\nRunning CPU benchmark (NumPy)...")
    cpu_avg, cpu_times = benchmark_qr_operations(use_gpu=False)
    
    if cpu_avg is not None:
        print("\nRunning GPU benchmark (MLX)...")
        gpu_avg, gpu_times = benchmark_qr_operations(use_gpu=True)
        
        if gpu_avg is not None:
            print("\nBenchmark Results:")
            print(f"CPU Average Time: {cpu_avg:.2f} seconds")
            print(f"GPU Average Time: {gpu_avg:.2f} seconds")
            print(f"Speedup: {cpu_avg/gpu_avg:.2f}x")
            
            print("\nDetailed Statistics:")
            print("\nCPU Times:")
            print(f"Min: {min(cpu_times):.2f}s")
            print(f"Max: {max(cpu_times):.2f}s")
            print(f"Std Dev: {np.std(cpu_times):.2f}s")
            
            print("\nGPU Times:")
            print(f"Min: {min(gpu_times):.2f}s")
            print(f"Max: {max(gpu_times):.2f}s")
            print(f"Std Dev: {np.std(gpu_times):.2f}s")

if __name__ == "__main__":
    run_full_benchmark()


Running CPU benchmark (NumPy)...

System Information:
OS: Darwin Darwin Kernel Version 23.4.0: Fri Mar 15 00:12:49 PDT 2024; root:xnu-10063.101.17~1/RELEASE_ARM64_T6020
CPU: arm
Memory: 96.0 GB
Backend: NumPy
Matrix size: 5000x5000
Iterations: 5
Iteration 1/5: 7.85 seconds
Iteration 2/5: 8.13 seconds
Iteration 3/5: 9.58 seconds
Iteration 4/5: 8.65 seconds
Iteration 5/5: 8.37 seconds

Running GPU benchmark (MLX)...
Finding maximum safe matrix size...
Testing size 5000x5000...
