In [1]:
# workspace/notebooks/gpu/benchmark_68gb_vram.ipynb

import torch
import time
import numpy as np
import psutil
import os
import sys

print("=== 68GB VRAM Comprehensive Benchmark ===")
print(f"PyTorch version: {torch.__version__}")
print(f"ROCm available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    device = torch.device('cuda:0')
    total_memory_gb = torch.cuda.get_device_properties(device).total_memory / 1e9
    print(f"Total VRAM: {total_memory_gb:.2f} GB")
    print(f"Device: {torch.cuda.get_device_name(0)}")
    print(f"ROCm version: {torch.version.hip}")
    
    # Clear any previous allocations
    torch.cuda.empty_cache()
    
    # ===== TEST 1: Large Tensor Operations =====
    print("\n" + "="*50)
    print("TEST 1: Large Tensor Operations")
    print("="*50)
    
    # Test with half precision (float16) to save memory
    dtype = torch.float16
    
    # Try allocating 20GB tensor
    try:
        # 20GB in float16: 20e9 bytes / 2 bytes per element = 10e9 elements
        # Let's do sqrt(10e9) ≈ 100,000 for square matrix
        size = 100000
        print(f"Allocating {size}x{size} matrix in float16...")
        
        start = time.time()
        A = torch.randn(size, size, dtype=dtype, device=device)
        allocation_time = time.time() - start
        
        allocated_gb = torch.cuda.memory_allocated() / 1e9
        print(f"✓ Allocated {allocated_gb:.2f} GB in {allocation_time:.2f} seconds")
        
        # Matrix multiplication (smaller to avoid OOM)
        print("Performing matrix multiplication (subset)...")
        start = time.time()
        B = A[:1000, :1000]  # Take subset
        C = B @ B.T
        mm_time = time.time() - start
        print(f"✓ Matrix multiplication: {mm_time:.2f} seconds")
        
        del A, B, C
        torch.cuda.empty_cache()
        
    except Exception as e:
        print(f"✗ Failed: {e}")
    
    # ===== TEST 2: Large Model Training =====
    print("\n" + "="*50)
    print("TEST 2: Large Model Training")
    print("="*50)
    
    try:
        # Create a large model (~10GB parameters in float16)
        print("Creating large neural network...")
        model = torch.nn.Sequential(
            torch.nn.Linear(8192, 16384),
            torch.nn.ReLU(),
            torch.nn.Linear(16384, 8192),
            torch.nn.ReLU(),
            torch.nn.Linear(8192, 4096),
            torch.nn.ReLU(),
            torch.nn.Linear(4096, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 1024),
            torch.nn.ReLU(),
            torch.nn.Linear(1024, 512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 10)
        ).to(device).to(dtype)  # Convert to half precision
        
        # Count parameters
        total_params = sum(p.numel() for p in model.parameters())
        print(f"Model parameters: {total_params:,}")
        print(f"Model size: {total_params * 2 / 1e9:.2f} GB (float16)")
        
        # Training setup
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        criterion = torch.nn.CrossEntropyLoss()
        
        # Create synthetic data
        batch_size = 64
        print(f"Creating synthetic data (batch_size={batch_size})...")
        inputs = torch.randn(batch_size, 8192, dtype=dtype, device=device)
        targets = torch.randint(0, 10, (batch_size,), device=device)
        
        # Training loop
        print("Running training steps...")
        times = []
        for step in range(10):
            torch.cuda.synchronize()
            start = time.time()
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            torch.cuda.synchronize()
            step_time = time.time() - start
            times.append(step_time)
            
            if step % 2 == 0:
                print(f"  Step {step}: loss={loss.item():.4f}, time={step_time:.3f}s")
        
        avg_time = np.mean(times)
        print(f"✓ Average step time: {avg_time:.3f} seconds")
        
        del model, optimizer, inputs, targets
        torch.cuda.empty_cache()
        
    except Exception as e:
        print(f"✗ Failed: {e}")
    
    # ===== TEST 3: Memory Management =====
    print("\n" + "="*50)
    print("TEST 3: Memory Management & Fragmentation")
    print("="*50)
    
    try:
        # Try allocating multiple large tensors to test fragmentation
        print("Testing memory allocation pattern...")
        
        tensor_sizes = [2, 4, 8, 16]  # GB
        tensors = []
        
        for i, size_gb in enumerate(tensor_sizes):
            try:
                # Calculate elements needed for size_gb in float32
                elements = int(size_gb * 1e9 / 4)
                dim = int(np.sqrt(elements))
                
                print(f"  Allocating {size_gb}GB tensor ({dim}x{dim})...")
                tensor = torch.randn(dim, dim, dtype=torch.float32, device=device)
                tensors.append(tensor)
                print(f"  ✓ Success - Total allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
                
            except Exception as e:
                print(f"  ✗ Failed at {size_gb}GB: {e}")
                break
        
        # Clean up
        print("Cleaning up...")
        for tensor in tensors:
            del tensor
        torch.cuda.empty_cache()
        
    except Exception as e:
        print(f"✗ Failed: {e}")
    
    # ===== TEST 4: Multi-Task Parallelism =====
    print("\n" + "="*50)
    print("TEST 4: Multi-Task Parallelism")
    print("="*50)
    
    try:
        print("Testing parallel tensor operations...")
        
        # Create multiple smaller tensors
        num_tensors = 10
        tensor_size = (1000, 1000, 100)  # ~400MB each in float32
        
        start = time.time()
        tensors = [torch.randn(*tensor_size, device=device) for _ in range(num_tensors)]
        creation_time = time.time() - start
        
        print(f"Created {num_tensors} tensors in {creation_time:.2f} seconds")
        print(f"Total memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        
        # Process them
        start = time.time()
        results = []
        for i, tensor in enumerate(tensors):
            result = tensor.mean() + tensor.std()
            results.append(result)
            if i % 2 == 0:
                print(f"  Processed tensor {i+1}/{num_tensors}")
        
        process_time = time.time() - start
        print(f"Processed all tensors in {process_time:.2f} seconds")
        
        del tensors, results
        torch.cuda.empty_cache()
        
    except Exception as e:
        print(f"✗ Failed: {e}")
    
    # ===== FINAL MEMORY REPORT =====
    print("\n" + "="*50)
    print("FINAL MEMORY REPORT")
    print("="*50)
    
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    max_allocated = torch.cuda.max_memory_allocated() / 1e9
    
    print(f"Currently allocated: {allocated:.2f} GB")
    print(f"Currently reserved:  {reserved:.2f} GB")
    print(f"Max allocated:       {max_allocated:.2f} GB")
    print(f"Available VRAM:      {total_memory_gb - allocated:.2f} GB")
    
    # Memory info
    print("\nMemory breakdown:")
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"  GPU {i}: {props.name}")
        print(f"    Total: {props.total_memory / 1e9:.2f} GB")
        print(f"    Multiprocessors: {props.multi_processor_count}")
    
    # Memory management tips
    print("\n" + "="*50)
    print("MEMORY MANAGEMENT TIPS")
    print("="*50)
    print("1. Use float16 for large models (half memory)")
    print("2. Clear cache regularly: torch.cuda.empty_cache()")
    print("3. Use gradient checkpointing for huge models")
    print("4. Set environment variable to reduce fragmentation:")
    print("   PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True")
    print("5. Use batch sizes that fit in memory")
    print(f"6. Your system can handle models up to ~{total_memory_gb * 0.8:.0f} GB")
    
else:
    print("No GPU available!")

# System info
print("\n" + "="*50)
print("SYSTEM INFORMATION")
print("="*50)
print(f"CPU: {psutil.cpu_count()} cores")
print(f"RAM: {psutil.virtual_memory().total / 1e9:.2f} GB total")
print(f"Disk: {psutil.disk_usage('/').total / 1e9:.2f} GB total")
print(f"Python: {sys.version}")

print("\n" + "="*50)
print("BENCHMARK COMPLETE!")
print("="*50)

=== 68GB VRAM Comprehensive Benchmark ===
PyTorch version: 2.9.1+rocm7.1.1.git351ff442
ROCm available: True
Total VRAM: 68.72 GB
Device: AMD Radeon Graphics
ROCm version: 7.1.52802-26aae437f6

TEST 1: Large Tensor Operations
Allocating 100000x100000 matrix in float16...
✓ Allocated 20.00 GB in 0.07 seconds
Performing matrix multiplication (subset)...
✓ Matrix multiplication: 0.16 seconds

TEST 2: Large Model Training
Creating large neural network...
Model parameters: 313,166,090
Model size: 0.63 GB (float16)
Creating synthetic data (batch_size=64)...
Running training steps...
  Step 0: loss=2.3008, time=0.199s
  Step 2: loss=nan, time=0.070s
  Step 4: loss=nan, time=0.069s
  Step 6: loss=nan, time=0.069s
  Step 8: loss=nan, time=0.069s
✓ Average step time: 0.082 seconds

TEST 3: Memory Management & Fragmentation
Testing memory allocation pattern...
  Allocating 2GB tensor (22360x22360)...
  ✓ Success - Total allocated: 3.41 GB
  Allocating 4GB tensor (31622x31622)...
  ✓ Success - Tota