In [1]:
!pip install psutil --q
!pip install gsutil --q
!pip install gputil --q
! pip install torch torchvision psutil -q


[0m

In [15]:
import torch
import gc
import psutil
import GPUtil

def get_system_resources():
    """
    Take a snapshot of CPU, RAM and per-GPU memory usage.

    Returns:
        dict with three keys:
            'cpu_percent': the value returned by psutil.cpu_percent()
            'ram_percent': psutil.virtual_memory().percent
            'gpu_info':    one dict per GPU reported by GPUtil.getGPUs(),
                           holding 'id', 'memory_total', 'memory_used',
                           'memory_free' and 'memory_util' (GPUtil's
                           memoryUtil multiplied by 100). The list is empty
                           when torch.cuda.is_available() is False.

    NOTE(review): cpu_percent() is called without an interval; psutil
    documents that such a call measures since the previous call — confirm
    the very first reading is meaningful for your use.
    """
    snapshot = {
        'cpu_percent': psutil.cpu_percent(),
        'ram_percent': psutil.virtual_memory().percent,
        'gpu_info': [],
    }

    # GPUs are only queried when PyTorch itself can see CUDA.
    if torch.cuda.is_available():
        snapshot['gpu_info'] = [
            {
                'id': device.id,
                'memory_total': device.memoryTotal,
                'memory_used': device.memoryUsed,
                'memory_free': device.memoryFree,
                'memory_util': device.memoryUtil * 100,
            }
            for device in GPUtil.getGPUs()
        ]

    return snapshot

def print_system_resources(resources):
    """
    Write a human-readable report of a resource snapshot to stdout.

    Args:
        resources: dict with 'cpu_percent', 'ram_percent' and 'gpu_info'
            keys. Each 'gpu_info' entry must provide 'id', 'memory_total',
            'memory_used', 'memory_free' and 'memory_util'.

    Returns:
        None. One line is printed for CPU, one for RAM, then a five-line
        section per GPU entry.
    """
    def report_lines():
        # Lines are produced lazily so each one is printed as soon as it is
        # formatted, exactly as a sequence of individual print calls would.
        yield f"CPU Usage: {resources['cpu_percent']}%"
        yield f"RAM Usage: {resources['ram_percent']}%"
        for entry in resources['gpu_info']:
            yield f"GPU {entry['id']}:"
            yield f"  Total Memory: {entry['memory_total']} MB"
            yield f"  Used Memory: {entry['memory_used']} MB"
            yield f"  Free Memory: {entry['memory_free']} MB"
            yield f"  Memory Utilization: {entry['memory_util']:.2f}%"

    for line in report_lines():
        print(line)

def gpu_cleanup():
    """
    Release cached GPU memory and print resource reports before and after.

    Steps, in order:
      1. Snapshot and print system resources.
      2. Call torch.cuda.empty_cache(), then gc.collect().
      3. If CUDA is available, enter each device in turn and call
         empty_cache() and ipc_collect() on it.
      4. Snapshot and print resources again, then print the per-GPU
         difference in 'memory_used' between the two snapshots.

    Returns:
        None; all results are printed.

    NOTE(review): the first empty_cache() runs before gc.collect() and
    outside the is_available() guard; the per-device pass repeats it after
    collection.
    """
    print("System resources before cleanup:")
    snapshot_pre = get_system_resources()
    print_system_resources(snapshot_pre)

    print("\nPerforming GPU memory cleanup...")

    # Drop the allocator cache, then run a garbage-collection pass.
    torch.cuda.empty_cache()
    gc.collect()

    # Second pass, one device at a time.
    if torch.cuda.is_available():
        device_names = (
            f'cuda:{index}' for index in range(torch.cuda.device_count())
        )
        for device_name in device_names:
            with torch.cuda.device(device_name):
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()

    print("\nSystem resources after cleanup:")
    snapshot_post = get_system_resources()
    print_system_resources(snapshot_post)

    print("\nGPU memory cleanup completed.")

    # Report how much 'memory_used' dropped on each GPU; the snapshots are
    # paired positionally.
    print("\nMemory freed:")
    gpu_pairs = zip(snapshot_pre['gpu_info'], snapshot_post['gpu_info'])
    for gpu_pre, gpu_post in gpu_pairs:
        delta = gpu_pre['memory_used'] - gpu_post['memory_used']
        print(f"GPU {gpu_pre['id']}: {delta} MB")

# Example usage: print a resource report, run the GPU cleanup, and print the
# report again. When this cell is executed in the notebook the guard is true,
# so gpu_cleanup() runs immediately (see the output below).
if __name__ == "__main__":
    # Your training code here

    # After training or when you need to clean up
    gpu_cleanup()

System resources before cleanup:
CPU Usage: 3.8%
RAM Usage: 26.5%
GPU 0:
  Total Memory: 16384.0 MB
  Used Memory: 124.0 MB
  Free Memory: 15992.0 MB
  Memory Utilization: 0.76%
GPU 1:
  Total Memory: 16384.0 MB
  Used Memory: 124.0 MB
  Free Memory: 15992.0 MB
  Memory Utilization: 0.76%

Performing GPU memory cleanup...

System resources after cleanup:
CPU Usage: 3.5%
RAM Usage: 26.5%
GPU 0:
  Total Memory: 16384.0 MB
  Used Memory: 124.0 MB
  Free Memory: 15992.0 MB
  Memory Utilization: 0.76%
GPU 1:
  Total Memory: 16384.0 MB
  Used Memory: 124.0 MB
  Free Memory: 15992.0 MB
  Memory Utilization: 0.76%

GPU memory cleanup completed.

Memory freed:
GPU 0: 0.0 MB
GPU 1: 0.0 MB


In [16]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [17]:
# Run distributed_training.py in a shell subprocess; everything the script
# prints is captured as this cell's output.
!python distributed_training.py

Rank 1, Epoch 0, Batch 0/98, Loss: 0.0082
GPU 1 Memory: 12498.0MB / 16384.0MB
GPU 1 Utilization: 90.00%
CPU Usage: 12.3%, RAM Usage: 28.7%
Rank 0, Epoch 0, Batch 0/98, Loss: 0.0076
GPU 0 Memory: 12498.0MB / 16384.0MB
GPU 0 Utilization: 100.00%
CPU Usage: 12.3%, RAM Usage: 28.7%
Rank 0, Epoch 0, Batch 10/98, Loss: 0.0025
GPU 0 Memory: 14478.0MB / 16384.0MB
GPU 0 Utilization: 100.00%
CPU Usage: 7.0%, RAM Usage: 28.7%
Rank 1, Epoch 0, Batch 10/98, Loss: 0.0025
GPU 1 Memory: 14478.0MB / 16384.0MB
GPU 1 Utilization: 100.00%
CPU Usage: 7.0%, RAM Usage: 28.7%
Rank 0, Epoch 0, Batch 20/98, Loss: 0.0011
GPU 0 Memory: 14478.0MB / 16384.0MB
GPU 0 Utilization: 100.00%
Rank 1, Epoch 0, Batch 20/98, Loss: 0.0015
GPU 1 Memory: 14478.0MB / 16384.0MB
GPU 1 Utilization: 97.00%
CPU Usage: 6.3%, RAM Usage: 28.7%
CPU Usage: 6.3%, RAM Usage: 28.7%
Rank 0, Epoch 0, Batch 30/98, Loss: 0.0122
GPU 0 Memory: 14478.0MB / 16384.0MB
GPU 0 Utilization: 100.00%
CPU Usage: 6.5%, RAM Usage: 28.7%
Rank 1, Epoch 0, Batch