# GPU Performance Testing and Optimization

This notebook provides tools for testing GPU performance and optimizing workloads in the research environment.

## 1. Setup and Environment Check

First, let's import the required libraries and check whether a GPU is available.

In [None]:
import sys
import os
import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Import custom utilities
sys.path.append('..')
from utils.gpu_utils import gpu_manager

# Ask the project's gpu_manager helper whether any GPU can be used
gpu_available = gpu_manager.check_gpu_availability()
print(f"GPU available: {gpu_available}")

# Without a GPU there is nothing to enumerate; otherwise list every device
# reported by gpu_manager together with its total memory.
if not gpu_available:
    print("No GPU detected. Some tests will be skipped.")
else:
    gpu_info = gpu_manager.get_gpu_info()

    print(f"GPU Count: {len(gpu_info)}")
    for idx, gpu in enumerate(gpu_info):
        # Missing fields fall back to 'Unknown' / 0 instead of raising
        print(f"GPU {idx}: {gpu.get('name', 'Unknown')}")
        print(f"  Memory: {gpu.get('memory_total_mb', 0)} MB")

## 2. PyTorch GPU Performance Testing

Let's test PyTorch matrix operations on GPU vs CPU.

In [None]:
# Import PyTorch if it is installed and record the outcome in TORCH_AVAILABLE.
# Only the import itself is guarded: an ImportError raised by the diagnostic
# calls below must not be misreported as "PyTorch not installed".
try:
    import torch
except ImportError:
    TORCH_AVAILABLE = False
    print("PyTorch not installed. Skipping PyTorch tests.")
else:
    TORCH_AVAILABLE = True
    print(f"PyTorch version: {torch.__version__}")

    # Query CUDA once and reuse the answer for both the report and the branch
    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")
    if cuda_available:
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU device: {torch.cuda.get_device_name(0)}")

In [None]:
def test_pytorch_performance(sizes=(1000, 2000, 4000, 8000)):
    """Benchmark square matrix multiplication on CPU versus GPU with PyTorch.

    For every ``size`` in *sizes*, two random ``size x size`` matrices are
    multiplied once on the CPU and once on the GPU. Each device gets one
    untimed warm-up multiplication first. The timings and the resulting
    speedups are printed and plotted with matplotlib.

    Args:
        sizes: Iterable of matrix dimensions to benchmark.

    Returns:
        A dict with the keys ``'sizes'``, ``'cpu_times'``, ``'gpu_times'`` and
        ``'speedups'`` (lists with one entry per size), or ``None`` when
        PyTorch or CUDA is not available and the test is skipped.
    """
    if not TORCH_AVAILABLE or not torch.cuda.is_available():
        print("PyTorch or CUDA not available. Skipping test.")
        return None

    # Work on a private list so the returned results never alias the caller's
    # sequence (or the shared default argument).
    sizes = list(sizes)

    results = {
        'sizes': sizes,
        'cpu_times': [],
        'gpu_times': [],
        'speedups': [],
    }

    for size in sizes:
        print(f"Testing matrix multiplication with size {size}x{size}...")

        # Create random matrices
        a_cpu = torch.rand(size, size)
        b_cpu = torch.rand(size, size)

        # CPU warm-up run, mirroring the GPU path so both devices are timed
        # on their second multiplication.
        torch.matmul(a_cpu, b_cpu)

        # CPU timed run; perf_counter is monotonic and high resolution
        start = time.perf_counter()
        torch.matmul(a_cpu, b_cpu)
        cpu_time = time.perf_counter() - start
        results['cpu_times'].append(cpu_time)
        print(f"  CPU time: {cpu_time:.4f} seconds")

        # Copy the same operands to the GPU
        a_gpu = a_cpu.cuda()
        b_gpu = b_cpu.cuda()

        # GPU warm-up run; synchronize so it has finished before timing starts
        c_gpu = torch.matmul(a_gpu, b_gpu)
        torch.cuda.synchronize()

        # GPU timed run; synchronize before reading the clock so the
        # measurement covers the whole computation.
        start = time.perf_counter()
        c_gpu = torch.matmul(a_gpu, b_gpu)
        torch.cuda.synchronize()
        gpu_time = time.perf_counter() - start
        results['gpu_times'].append(gpu_time)
        print(f"  GPU time: {gpu_time:.4f} seconds")

        # Calculate speedup (guard against a zero-length GPU measurement)
        speedup = cpu_time / gpu_time if gpu_time > 0 else float('inf')
        results['speedups'].append(speedup)
        print(f"  Speedup: {speedup:.2f}x")

        # Release GPU tensors before the next, larger size is allocated
        del a_gpu, b_gpu, c_gpu
        torch.cuda.empty_cache()

    # Plot results: absolute timings on the left, speedup on the right
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(sizes, results['cpu_times'], 'b-o', label='CPU')
    plt.plot(sizes, results['gpu_times'], 'r-o', label='GPU')
    plt.xlabel('Matrix Size')
    plt.ylabel('Time (seconds)')
    plt.title('Matrix Multiplication Performance')
    plt.legend()
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(sizes, results['speedups'], 'g-o')
    plt.xlabel('Matrix Size')
    plt.ylabel('Speedup (CPU time / GPU time)')
    plt.title('GPU Speedup Factor')
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    return results

# Run the test with smaller matrices first
# Uncomment to run the test
# results = test_pytorch_performance(sizes=[1000, 2000, 4000])

## 3. TensorFlow GPU Performance Testing

In [None]:
# Import TensorFlow if it is installed and record the outcome in TF_AVAILABLE.
# Only the import itself is guarded: an ImportError raised by the diagnostic
# calls below must not be misreported as "TensorFlow not installed".
try:
    import tensorflow as tf
except ImportError:
    TF_AVAILABLE = False
    print("TensorFlow not installed. Skipping TensorFlow tests.")
else:
    TF_AVAILABLE = True
    print(f"TensorFlow version: {tf.__version__}")
    print(f"GPU devices: {tf.config.list_physical_devices('GPU')}")

In [None]:
def test_tensorflow_performance(sizes=(1000, 2000, 4000)):
    """Benchmark square matrix multiplication on CPU versus GPU with TensorFlow.

    For every ``size`` in *sizes*, two random ``size x size`` matrices are
    created and multiplied on ``/cpu:0`` and on ``/gpu:0``. Each device gets
    one untimed warm-up multiplication first. The timings and the resulting
    speedups are printed and plotted with matplotlib.

    Args:
        sizes: Iterable of matrix dimensions to benchmark.

    Returns:
        A dict with the keys ``'sizes'``, ``'cpu_times'``, ``'gpu_times'`` and
        ``'speedups'`` (lists with one entry per size), or ``None`` when
        TensorFlow or a TensorFlow-visible GPU is not available.
    """
    if not TF_AVAILABLE:
        print("TensorFlow not available. Skipping test.")
        return None

    # Check for GPU availability in TensorFlow
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        print("No GPU available for TensorFlow. Skipping test.")
        return None

    def _wait_for(tensor):
        # GPU ops are dispatched asynchronously; copying a (at most 1x1) slice
        # of the result back to the host blocks until the op that produced
        # `tensor` has finished, so the clock is read after the work is done.
        tensor[:1, :1].numpy()

    # Work on a private list so the returned results never alias the caller's
    # sequence (or the shared default argument).
    sizes = list(sizes)

    results = {
        'sizes': sizes,
        'cpu_times': [],
        'gpu_times': [],
        'speedups': [],
    }

    # Set memory growth to avoid OOM errors. This is best effort: report a
    # failure instead of silently swallowing every exception.
    for gpu in gpus:
        try:
            tf.config.experimental.set_memory_growth(gpu, True)
        except (RuntimeError, ValueError) as exc:
            print(f"  Could not enable memory growth for {gpu}: {exc}")

    for size in sizes:
        print(f"Testing TensorFlow matrix multiplication with size {size}x{size}...")

        # CPU test
        with tf.device('/cpu:0'):
            a_cpu = tf.random.normal([size, size])
            b_cpu = tf.random.normal([size, size])

            # Warm-up run
            _wait_for(tf.matmul(a_cpu, b_cpu))

            # Timed run
            start = time.perf_counter()
            _wait_for(tf.matmul(a_cpu, b_cpu))
            cpu_time = time.perf_counter() - start
            results['cpu_times'].append(cpu_time)
            print(f"  CPU time: {cpu_time:.4f} seconds")

        # GPU test
        with tf.device('/gpu:0'):
            a_gpu = tf.random.normal([size, size])
            b_gpu = tf.random.normal([size, size])

            # Warm-up run; wait for it so it cannot overlap the timed run
            _wait_for(tf.matmul(a_gpu, b_gpu))

            # Timed run; wait for the result before reading the clock
            start = time.perf_counter()
            _wait_for(tf.matmul(a_gpu, b_gpu))
            gpu_time = time.perf_counter() - start
            results['gpu_times'].append(gpu_time)
            print(f"  GPU time: {gpu_time:.4f} seconds")

        # Calculate speedup (guard against a zero-length GPU measurement)
        speedup = cpu_time / gpu_time if gpu_time > 0 else float('inf')
        results['speedups'].append(speedup)
        print(f"  Speedup: {speedup:.2f}x")

    # Plot results: absolute timings on the left, speedup on the right
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(sizes, results['cpu_times'], 'b-o', label='CPU')
    plt.plot(sizes, results['gpu_times'], 'r-o', label='GPU')
    plt.xlabel('Matrix Size')
    plt.ylabel('Time (seconds)')
    plt.title('TensorFlow Matrix Multiplication Performance')
    plt.legend()
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(sizes, results['speedups'], 'g-o')
    plt.xlabel('Matrix Size')
    plt.ylabel('Speedup (CPU time / GPU time)')
    plt.title('GPU Speedup Factor')
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    return results

# Uncomment to run the test
# results_tf = test_tensorflow_performance(sizes=[1000, 2000, 4000])

## 4. Memory Usage Optimization Tests

Test two memory optimization strategies: FP32 versus FP16 tensors, and a single large batch versus gradient accumulation over smaller batches.

In [None]:
def test_memory_optimization():
    """Compare GPU memory use of two optimization strategies with PyTorch.

    Test 1 allocates two random 5000x5000 matrices, multiplies them, and
    reports elapsed time and the growth in allocated GPU memory, once in
    float32 and once in float16.

    Test 2 runs a forward and backward pass through a single matmul, once on
    one large batch and once as several smaller batches whose gradients
    accumulate in ``weights.grad``, and reports the peak GPU memory growth of
    each approach.

    All results are printed. Returns ``None``; the test is skipped with a
    message when PyTorch or CUDA is not available.
    """
    if not TORCH_AVAILABLE or not torch.cuda.is_available():
        print("PyTorch or CUDA not available. Skipping test.")
        return

    def allocated_mb():
        # GPU memory currently occupied by tensors, in MiB
        return torch.cuda.memory_allocated() / 1024 / 1024

    def peak_growth_mb(start_mb):
        # Peak tensor memory since the last reset_peak_memory_stats() call,
        # expressed as growth over `start_mb`.
        return torch.cuda.max_memory_allocated() / 1024 / 1024 - start_mb

    def percent_saved(candidate, baseline):
        # Relative saving of `candidate` over `baseline`; 0.0 when the
        # baseline is not positive so the report never divides by zero.
        if baseline <= 0:
            return 0.0
        return (1 - candidate / baseline) * 100

    def measure_matmul(dtype, size):
        # Warm-up on tiny operands so the first measured run is not the first
        # matmul of this dtype in the process (presumably this keeps one-time
        # CUDA/library setup out of the measurement).
        warm = torch.rand(8, 8, device='cuda', dtype=dtype)
        torch.matmul(warm, warm)
        torch.cuda.synchronize()
        del warm

        torch.cuda.empty_cache()
        start_memory = allocated_mb()
        start_time = time.perf_counter()

        a = torch.rand(size, size, device='cuda', dtype=dtype)
        b = torch.rand(size, size, device='cuda', dtype=dtype)
        c = torch.matmul(a, b)
        # Wait for the GPU before reading the clock
        torch.cuda.synchronize()

        elapsed = time.perf_counter() - start_time
        used_memory = allocated_mb() - start_memory

        # Free memory before the next measurement
        del a, b, c
        torch.cuda.empty_cache()
        return elapsed, used_memory

    print("\nTesting GPU memory usage optimization strategies...")

    # Base memory usage
    torch.cuda.empty_cache()
    base_memory = allocated_mb()
    print(f"Base GPU memory usage: {base_memory:.2f} MB")

    # Test 1: Standard (float32) vs half precision (float16) tensors
    print("\nTest 1: Standard vs Mixed Precision")
    size = 5000

    fp32_time, fp32_memory = measure_matmul(torch.float32, size)
    print(f"FP32: Time = {fp32_time:.4f}s, Memory = {fp32_memory:.2f} MB")

    fp16_time, fp16_memory = measure_matmul(torch.float16, size)
    print(f"FP16: Time = {fp16_time:.4f}s, Memory = {fp16_memory:.2f} MB")

    speedup = fp32_time / fp16_time if fp16_time > 0 else float('inf')
    print(f"Speedup: {speedup:.2f}x, Memory saving: {percent_saved(fp16_memory, fp32_memory):.1f}%")

    # Test 2: Gradient accumulation simulation
    print("\nTest 2: Gradient Accumulation Simulation")

    batch_size = 128
    model_size = 1000
    accumulation_steps = 4

    # Standard approach (large batch)
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    start_memory = allocated_mb()

    big_batch = torch.rand(batch_size * accumulation_steps, model_size, device='cuda')
    # The weights must require gradients, otherwise loss.backward() raises
    # because nothing in the graph is differentiable.
    weights = torch.rand(model_size, model_size, device='cuda', requires_grad=True)
    output = torch.matmul(big_batch, weights)
    loss = output.sum()
    loss.backward()

    torch.cuda.synchronize()
    big_batch_memory = peak_growth_mb(start_memory)

    print(f"Large batch approach: Memory = {big_batch_memory:.2f} MB")

    # Free memory (dropping `weights` also drops its .grad)
    del big_batch, weights, output, loss
    torch.cuda.empty_cache()

    # Gradient accumulation approach, measured the same way as the large
    # batch so the two numbers are comparable.
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    start_memory = allocated_mb()

    weights = torch.rand(model_size, model_size, device='cuda', requires_grad=True)

    for _ in range(accumulation_steps):
        small_batch = torch.rand(batch_size, model_size, device='cuda')
        output = torch.matmul(small_batch, weights)
        # Scale each step so the accumulated gradient is the average over steps
        loss = output.sum() / accumulation_steps
        loss.backward()

        # Free intermediate tensors before the next step allocates its own
        del small_batch, output, loss

    torch.cuda.synchronize()
    accum_memory = peak_growth_mb(start_memory)

    print(f"Gradient accumulation approach: Memory = {accum_memory:.2f} MB")
    print(f"Memory saving: {percent_saved(accum_memory, big_batch_memory):.1f}%")

    # Free memory
    del weights
    torch.cuda.empty_cache()

# Uncomment to run the test
# test_memory_optimization()

## 5. System-specific Optimization Recommendations

Generate optimization recommendations based on the testing results and system configuration.

In [None]:
def generate_optimization_recommendations():
    """Build a list of human-readable GPU optimization recommendations.

    The advice depends on what ``gpu_manager`` reports (GPU presence, summed
    ``memory_total_mb`` and device count) and on whether PyTorch or TensorFlow
    is already imported in this process with a usable GPU.

    Returns:
        A list of strings, one recommendation or heading per entry. Headings
        that start a new section begin with a newline character.
    """
    recommendations = []

    # GPU availability recommendations
    if gpu_manager.check_gpu_availability():
        gpu_info = gpu_manager.get_gpu_info()
        # NOTE(review): this sums memory across all GPUs, while the thresholds
        # read like per-device sizes — confirm which is intended.
        total_gpu_memory = sum(gpu.get('memory_total_mb', 0) for gpu in gpu_info)

        # GPU memory recommendations
        if total_gpu_memory < 8000:  # Less than 8GB
            recommendations.extend([
                "Limited GPU memory detected. Consider these optimizations:",
                "- Use mixed precision (FP16) training when possible",
                "- Implement gradient accumulation for large models",
                "- Reduce batch sizes and increase accumulation steps",
            ])
        elif total_gpu_memory < 16000:  # Between 8GB and 16GB
            recommendations.extend([
                "Moderate GPU memory available. Consider these optimizations:",
                "- Use mixed precision for models larger than 100M parameters",
                "- Monitor memory usage during training with the system monitoring utilities",
            ])
        else:  # 16GB or more
            recommendations.extend([
                "Large GPU memory available. Optimize for speed:",
                "- Increase batch sizes for better throughput",
                "- Use model parallelism for very large models",
            ])

        # Multi-GPU recommendations
        if len(gpu_info) > 1:
            recommendations.extend([
                "\nMultiple GPUs detected. Consider these strategies:",
                "- Use DataParallel/DistributedDataParallel for training",
                "- Allocate different experiments to different GPUs",
                "- Use the NVIDIA monitoring tools to track individual GPU usage",
            ])
    else:
        recommendations.extend([
            "No GPU detected. For improved performance:",
            "- Consider running on a GPU-enabled system for deep learning tasks",
            "- Optimize CPU usage by setting proper thread counts",
            "- Use smaller models or quantized versions when possible",
        ])

    # Framework-specific recommendations. The modules are taken from
    # sys.modules rather than from the global names `torch` / `tf`: a
    # framework can be loaded (e.g. by another module) without those names
    # being bound here, which would otherwise raise NameError.
    torch_module = sys.modules.get('torch')
    if torch_module is not None and torch_module.cuda.is_available():
        recommendations.extend([
            "\nPyTorch-specific optimizations:",
            "- Use torch.cuda.amp.autocast for automatic mixed precision",
            "- Implement torch.cuda.empty_cache() between large operations",
            "- Consider torch.backends.cudnn.benchmark=True for repeated operations",
        ])

    tf_module = sys.modules.get('tensorflow')
    if tf_module is not None and len(tf_module.config.list_physical_devices('GPU')) > 0:
        recommendations.extend([
            "\nTensorFlow-specific optimizations:",
            "- Use tf.config.experimental.set_memory_growth(gpu, True) to avoid memory spikes",
            "- Enable mixed precision with tf.keras.mixed_precision.set_global_policy('mixed_float16')",
            "- Use TF datasets with prefetch for better GPU utilization",
        ])

    return recommendations

# Collect the recommendations for this system, then print them one per line
# beneath a heading underlined with 30 dashes.
recommendations = generate_optimization_recommendations()
heading = "GPU Optimization Recommendations\n" + "-"*30
print(heading)
for rec in recommendations:
    print(rec)