In [1]:
import ray
import torch
import time
import os

def get_custom_gpu_names():
    """Return the names of the custom per-GPU resources known to the Ray cluster.

    If Ray is not yet initialized in this process, it is initialized with
    ``address="auto"``. A cluster resource is treated as a custom GPU
    resource when its name contains the substring ``"_GPU"``.

    Returns:
        list[str]: The matching resource names, sorted alphabetically.
    """
    if not ray.is_initialized():
        ray.init(address="auto", ignore_reinit_error=True)

    cluster = ray.cluster_resources()
    # Keep only the custom per-GPU entries and hand them back in sorted order.
    return sorted(name for name in cluster if "_GPU" in name)

def select_gpu():
    """Point ``CUDA_VISIBLE_DEVICES`` at the GPU encoded in the assigned resource.

    Reads the resources assigned to the current Ray worker, takes the first
    one whose name contains ``"_GPU"`` (e.g. ``"node1_GPU0"``) and exports the
    integer that follows the last ``"_GPU"`` as ``CUDA_VISIBLE_DEVICES``.

    Raises:
        IndexError: If no assigned resource name contains ``"_GPU"``.
        ValueError: If the text after ``"_GPU"`` is not an integer.
    """
    runtime_ctx = ray.get_runtime_context()
    assigned = runtime_ctx.get_assigned_resources()

    custom_gpu_keys = [name for name in assigned.keys() if "_GPU" in name]
    chosen = custom_gpu_keys[0]  # e.g. "node1_GPU0"

    # The trailing piece after "_GPU" is the node-local device index.
    index_text = chosen.split("_GPU")[-1]
    device_index = int(index_text)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(device_index)

@ray.remote
class CustomGPU:
    """Ray actor used to query the free memory of one GPU."""

    def __init__(self):
        # Restrict CUDA in this actor process to the GPU named by the actor's
        # assigned custom resource.
        select_gpu()

    def get_free_memory(self):
        """Return the free memory of CUDA device 0 in GB.

        The value is also stored on the actor as ``self.free_memory``.
        """
        free_bytes = torch.cuda.mem_get_info(0)[0]
        free_gb = free_bytes / 1024 / 1024 / 1024  # bytes -> GB
        self.free_memory = free_gb
        return free_gb

def find_top_k_gpu(k=1):
    """Rank the cluster's custom GPU resources by free memory.

    Every resource returned by ``get_custom_gpu_names()`` is probed, one at a
    time, with a short-lived ``CustomGPU`` actor that requests ``0.01`` of
    that resource and reports the free memory it sees.

    Args:
        k: Number of resource names to return. ``0`` returns every name.

    Returns:
        list[str]: Resource names ordered from most to least free memory,
        truncated to the first ``k`` entries unless ``k == 0``.
    """
    print(f"Finding top {k} GPU...")
    # check if ray is initialized
    if not ray.is_initialized():
        ray.init(address="auto", ignore_reinit_error=True)
    gpu_names = get_custom_gpu_names()
    gpu_free_memory = []
    for gpu_name in gpu_names:
        actor = CustomGPU.options(resources={gpu_name: 0.01}).remote()
        try:
            free_memory = ray.get(actor.get_free_memory.remote())
        finally:
            # Always release the probe actor, even when the query fails;
            # otherwise it keeps holding its slice of the GPU resource.
            ray.kill(actor)
        gpu_free_memory.append((gpu_name, free_memory))
        print(f"GPU: {gpu_name}, Free memory: {free_memory:.2f} GB")
    # sort by free memory, largest first (stable, so ties keep name order)
    gpu_free_memory.sort(key=lambda x: x[1], reverse=True)
    gpu_names = [gpu_name for gpu_name, _ in gpu_free_memory]
    if k == 0:
        return gpu_names
    return gpu_names[:k]

def find_eligible_gpu(gpu_names, n_gpu=4, free_memory_threshold=1):
    """Pick the first ``n_gpu`` GPUs whose free memory exceeds a threshold.

    GPUs are probed in the order given, each with a short-lived ``CustomGPU``
    actor that requests ``0.01`` of the named resource. Probing stops as soon
    as enough eligible GPUs have been found.

    Args:
        gpu_names: Custom GPU resource names to probe, in priority order.
        n_gpu: Number of eligible GPUs required.
        free_memory_threshold: Minimum free memory in GB; a GPU qualifies
            only when its free memory is strictly greater than this value.

    Returns:
        list[str] | None: The eligible resource names once ``n_gpu`` of them
        have been found, or ``None`` when fewer than ``n_gpu`` qualify.
    """
    eligible_gpu = []
    for gpu_name in gpu_names:
        actor = CustomGPU.options(resources={gpu_name: 0.01}).remote()
        try:
            free_memory = ray.get(actor.get_free_memory.remote())
        finally:
            # Release the probe actor so it does not keep holding its slice
            # of the GPU resource after the measurement.
            ray.kill(actor)
        if free_memory > free_memory_threshold:
            eligible_gpu.append(gpu_name)
            print(f"Found eligible GPU: {gpu_name}, Free memory: {free_memory:.2f} GB")
        if len(eligible_gpu) >= n_gpu:
            return eligible_gpu
    return None


In [2]:
def my_task():
    """Run a small demo workload and return the shape of its result.

    Prints a start timestamp, sleeps for five seconds, multiplies two random
    1000x1000 matrices, prints an end timestamp and returns the shape of the
    product.
    """
    def _now():
        # Local wall-clock time as zero-padded HH:MM:SS.
        return time.strftime("%H:%M:%S", time.localtime())

    print(f'start task at {_now()}')
    time.sleep(5)
    lhs, rhs = torch.randn(1000, 1000), torch.randn(1000, 1000)
    product = torch.matmul(lhs, rhs)
    print(f'end task at {_now()}')
    return product.shape

@ray.remote
class worker:
    """Ray actor that runs ``my_task`` after selecting its assigned GPU."""

    def __init__(self):
        # Restrict CUDA in this actor process to the GPU named by the actor's
        # assigned custom resource.
        select_gpu()

    def task(self):
        """Execute ``my_task`` inside the actor and return its result."""
        outcome = my_task()
        return outcome

In [None]:

if __name__ == "__main__":
    # Run 4 workers on top 4 GPUs simultaneously
    n_workers = 4
    free_memory_threshold = 1  # GB
    try:
        # sort GPUs by free memory
        sorted_gpu_names = find_top_k_gpu(k=0)
        eligible_gpu = find_eligible_gpu(
            sorted_gpu_names,
            n_gpu=n_workers,
            free_memory_threshold=free_memory_threshold,
        )
        print(f"Eligible GPUs: {eligible_gpu}")
        if eligible_gpu is None:
            # find_eligible_gpu returns None when too few GPUs qualify;
            # iterating over it would raise a TypeError.
            print(
                f"Fewer than {n_workers} GPUs have more than "
                f"{free_memory_threshold} GB free; no workers started."
            )
        else:
            workers = [worker.options(resources={gpu: 0.01}).remote() for gpu in eligible_gpu]
            # `actor` (not `worker`) so the loop variable does not shadow the actor class.
            results = ray.get([actor.task.remote() for actor in workers])
            print(results)
    finally:
        # shutdown ray even if probing or a task failed
        ray.shutdown()