In [1]:
print("Hello World!")

Hello World!


In [3]:
import ray

# Use the FQDN to explicitly address the service in the 'default' namespace
RAY_HEAD_ADDRESS = "ray://raycluster-latest-head-svc.default.svc.cluster.local:10001"

try:
    # Run this code inside the Python environment of your remote kernel
    ray.init(RAY_HEAD_ADDRESS)
    print("SUCCESS! Ray is connected using the FQDN.")
    # You can now proceed to run Ray tasks
except Exception as e:
    print(f"Connection still failed: {e}")

2025-10-13 00:34:40,921	INFO client_builder.py:241 -- Passing the following kwargs to ray.init() on the server: log_to_driver


Connection still failed: Ray Client is already connected. Maybe you called ray.init("ray://<address>") twice by accident?


In [7]:
import ray
import time
import torch # We'll use torch to prove the environment is GPU-ready

# 1. Ensure Ray is connected (you ran this successfully before)
# Use the FQDN to connect to the Ray Head Service in the 'default' namespace
RAY_HEAD_ADDRESS = "ray://raycluster-latest-head-svc.default.svc.cluster.local:10001"

if not ray.is_initialized():
    try:
        ray.init(RAY_HEAD_ADDRESS)
        print("Ray connection established.")
    except Exception as e:
        print(f"Connection Failed: {e}")
        # If connection fails, check Kubernetes service name or firewall.

# 2. Define a remote function that explicitly requests one GPU
@ray.remote(num_gpus=1)
def check_gpu_status():
    """
    A remote Ray task that runs on a GPU worker and checks for PyTorch's CUDA availability.
    """
    import socket
    
    # Check if a GPU is visible to this worker process
    gpu_available = torch.cuda.is_available()
    
    # Get the worker node's hostname (i.e., the Kubernetes Pod name)
    worker_hostname = socket.gethostname()
    
    return {
        "hostname": worker_hostname,
        "cuda_available": gpu_available,
        "device_count": torch.cuda.device_count()
    }

# 3. Execute the task and retrieve the result
print("Submitting GPU task...")

# Submit the task to the cluster (it will wait for a GPU worker to be available)
future = check_gpu_status.remote()

# Retrieve the result
result = ray.get(future)

# 4. Display results and verify GPU usage
print("\n--- GPU Task Result ---")
print(f"Task executed on node: {result['hostname']}")
print(f"CUDA Available (GPU Found by PyTorch): {result['cuda_available']}")
print(f"CUDA Devices Found: {result['device_count']}")

# 5. Clean up the Ray connection (optional)
# ray.shutdown()

Submitting GPU task...


ModuleNotFoundError: No module named 'torch'

In [5]:
!pip install torch

I0000 00:00:1760315750.613368    2427 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


Collecting torch
  Downloading torch-2.8.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cudnn-cu12==9.10.2.21 (from torch)
  Downloading nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl.metadata (1.8 kB)
Collecting nvidia-cublas-cu12==