In [3]:
# workspace/notebooks/gpu/memory_optimization.ipynb

import torch
import gc

# Prints a GPU memory-optimization cheat sheet for the first CUDA/HIP device:
# live allocator statistics, cache-clearing commands, docker-compose
# environment variables, PyTorch memory-saving techniques, and rough sizing
# guidance scaled to the device's total VRAM.


def bytes_to_gb(num_bytes):
    """Convert a byte count to decimal gigabytes (1 GB = 1e9 bytes).

    Args:
        num_bytes: Size in bytes (int or float).

    Returns:
        The size in gigabytes as a float.
    """
    return num_bytes / 1e9


if torch.cuda.is_available():
    device = torch.device('cuda:0')
    total_memory = bytes_to_gb(torch.cuda.get_device_properties(device).total_memory)

    # The header is derived from the detected device instead of a hard-coded
    # size, so it stays truthful on other GPUs (truncated to whole GB).
    print(f"=== Memory Optimization Guide for {int(total_memory)}GB VRAM ===")
    print(f"Total VRAM: {total_memory:.2f} GB")
    print("\nMemory Management Commands:")

    # Query the allocator for the same device whose total is reported above;
    # without an explicit device these calls report the *current* device,
    # which may differ from cuda:0 on multi-GPU hosts.
    allocated_gb = bytes_to_gb(torch.cuda.memory_allocated(device))
    reserved_gb = bytes_to_gb(torch.cuda.memory_reserved(device))
    peak_allocated_gb = bytes_to_gb(torch.cuda.max_memory_allocated(device))

    print("\n1. Check current memory usage:")
    print(f"   torch.cuda.memory_allocated(): {allocated_gb:.2f} GB")
    print(f"   torch.cuda.memory_reserved(): {reserved_gb:.2f} GB")
    print(f"   torch.cuda.max_memory_allocated(): {peak_allocated_gb:.2f} GB")

    print("\n2. Clear memory:")
    print("   torch.cuda.empty_cache() - Clears unused cached memory")
    print("   gc.collect() - Runs Python garbage collector")

    # The snippet below is display-only text; it lists both the CUDA and HIP
    # allocator variables plus device-visibility and CPU-thread settings.
    print("\n3. Environment variables for docker-compose:")
    print("""
    environment:
      # Prevent memory fragmentation
      PYTORCH_CUDA_ALLOC_CONF: "max_split_size_mb:512" #,expandable_segments:True"
      PYTORCH_HIP_ALLOC_CONF: "max_split_size_mb:512"  #,expandable_segments:True"
      
      # Memory optimization
      HIP_VISIBLE_DEVICES: "0"
      ROCR_VISIBLE_DEVICES: "0"
      
      # CPU optimization
      OMP_NUM_THREADS: "32"
      MKL_NUM_THREADS: "32"
    """)

    print("\n4. PyTorch memory optimization techniques:")
    print("""
    a) Use mixed precision training:
        from torch.cuda.amp import autocast, GradScaler
        
    b) Use gradient checkpointing:
        torch.utils.checkpoint.checkpoint(model, x)
        
    c) Use DataLoader with pin_memory=True:
        DataLoader(..., pin_memory=True, num_workers=4)
        
    d) Clear gradients periodically:
        optimizer.zero_grad(set_to_none=True)
        
    e) Use in-place operations:
        x.relu_() instead of x = torch.relu(x)
    """)

    # Display-only example; it is printed, not executed by this cell.
    print("\n5. Example workflow for large models:")
    print("""
    # Step 1: Check available memory
    available = total_memory - torch.cuda.memory_allocated()/1e9
    
    # Step 2: Estimate model size
    # float32: 4 bytes per parameter
    # float16: 2 bytes per parameter
    
    # Step 3: Leave 10% buffer
    max_model_size = available * 0.9
    
    # Step 4: Load model in appropriate precision
    if max_model_size > 30:  # GB
        dtype = torch.float32  # Full precision
    else:
        dtype = torch.float16  # Half precision
    """)

    # Static rule-of-thumb table; nothing is benchmarked here.
    print("\n6. Recommended batch sizes for your system:")
    print("""
    • Small models (<1GB): batch_size 128-256
    • Medium models (1-10GB): batch_size 32-64  
    • Large models (10-30GB): batch_size 8-16
    • Very large models (30-50GB): batch_size 1-4
    """)

    # Sizing hints are fixed fractions (80% / 60% / 10%) of total VRAM.
    print(f"\n7. With your {total_memory:.1f} GB VRAM, you can:")
    print(f"   • Run models up to {total_memory * 0.8:.0f} GB")
    print(f"   • Train with datasets up to {total_memory * 0.6:.0f} GB in memory")
    print(f"   • Keep {total_memory * 0.1:.0f} GB free for operations")

    # The success banner is only shown when the guide was actually produced;
    # previously it was also printed right after "No GPU available!".
    print("\n" + "="*60)
    print("Memory optimization functions loaded successfully!")
    print("="*60)

else:
    # No device to describe: use a size-free header and skip the success banner.
    print("=== Memory Optimization Guide ===")
    print("No GPU available!")

=== Memory Optimization Guide for 68GB VRAM ===
Total VRAM: 68.72 GB

Memory Management Commands:

1. Check current memory usage:
   torch.cuda.memory_allocated(): 0.00 GB
   torch.cuda.memory_reserved(): 0.00 GB
   torch.cuda.max_memory_allocated(): 0.00 GB

2. Clear memory:
   torch.cuda.empty_cache() - Clears unused cached memory
   gc.collect() - Runs Python garbage collector

3. Environment variables for docker-compose:

    environment:
      # Prevent memory fragmentation
      PYTORCH_CUDA_ALLOC_CONF: "max_split_size_mb:512" #,expandable_segments:True"
      PYTORCH_HIP_ALLOC_CONF: "max_split_size_mb:512"  #,expandable_segments:True"

      # Memory optimization
      HIP_VISIBLE_DEVICES: "0"
      ROCR_VISIBLE_DEVICES: "0"

      # CPU optimization
      OMP_NUM_THREADS: "32"
      MKL_NUM_THREADS: "32"
    

4. PyTorch memory optimization techniques:

    a) Use mixed precision training:
        from torch.cuda.amp import autocast, GradScaler

    b) Use gradient checkpointin