In [7]:
import psutil
import time
# Probe for NVML support once at import time. NVML_AVAILABLE records whether
# both the pynvml import and nvmlInit() succeeded.
try:
    import pynvml
    pynvml.nvmlInit()
    NVML_AVAILABLE = True
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate; a missing module or a
    # failed NVML initialisation simply marks NVML as unavailable.
    NVML_AVAILABLE = False
from nvitop import Device, CudaDevice, MigDevice,NA

class SystemMetrics:
    """Samples CPU, memory, disk, network and GPU readings for the local host.

    Disk and network throughput are derived from the difference between the
    psutil byte counters seen on two successive calls, divided by the elapsed
    wall-clock time. Each source keeps its own timestamp
    (``prev_disk_time`` / ``prev_net_time``) so that calling
    ``get_disk_metrics`` and ``get_network_metrics`` back to back does not make
    the second one divide by a near-zero interval.
    """

    def __init__(self):
        self.prev_read_bytes = 0
        self.prev_write_bytes = 0
        self.prev_net_bytes_recv = 0
        self.prev_net_bytes_sent = 0
        now = time.time()
        # Retained for backward compatibility: time of the most recent disk or
        # network sample. Rates are computed from the per-source stamps below.
        self.prev_time = now
        self.prev_disk_time = now
        self.prev_net_time = now
        self._initialize_counters()
        self.devices = Device.all()

    def _initialize_counters(self):
        """Seed the previous-counter attributes with the current psutil totals."""
        io_counters = psutil.net_io_counters()
        self.prev_net_bytes_recv = io_counters.bytes_recv
        self.prev_net_bytes_sent = io_counters.bytes_sent
        disk_io = psutil.disk_io_counters()
        # psutil returns None when the host exposes no disks; keep the current
        # baseline in that case instead of failing on attribute access.
        if disk_io is not None:
            self.prev_read_bytes = disk_io.read_bytes
            self.prev_write_bytes = disk_io.write_bytes

    def get_cpu_metrics(self):
        """Return per-CPU utilisation, per-CPU frequencies and memory usage percent.

        Returns:
            dict with keys ``cpu_percentages``, ``cpu_freqs`` and ``mem_percent``,
            filled from the corresponding psutil calls.
        """
        return {
            'cpu_percentages': psutil.cpu_percent(percpu=True),
            'cpu_freqs': psutil.cpu_freq(percpu=True),
            'mem_percent': psutil.virtual_memory().percent
        }

    def get_disk_metrics(self):
        """Return disk throughput since the previous disk sample and usage of '/'.

        Returns:
            dict with ``read_speed`` / ``write_speed`` (byte delta divided by
            1024**2 and by the elapsed seconds) and ``disk_used`` /
            ``disk_total`` as reported by ``psutil.disk_usage('/')``.
        """
        current_time = time.time()
        io_counters = psutil.disk_io_counters()
        disk_usage = psutil.disk_usage('/')

        # Clamp the interval so two calls within the same clock tick cannot
        # divide by zero.
        time_delta = max(current_time - self.prev_disk_time, 1e-6)

        if io_counters is None:
            # No disks reported: reuse the previous totals so the delta is zero.
            read_bytes = self.prev_read_bytes
            write_bytes = self.prev_write_bytes
        else:
            read_bytes = io_counters.read_bytes
            write_bytes = io_counters.write_bytes

        read_speed = (read_bytes - self.prev_read_bytes) / (1024**2) / time_delta
        write_speed = (write_bytes - self.prev_write_bytes) / (1024**2) / time_delta

        self.prev_read_bytes = read_bytes
        self.prev_write_bytes = write_bytes
        self.prev_disk_time = current_time
        self.prev_time = current_time

        return {
            'read_speed': read_speed,
            'write_speed': write_speed,
            'disk_used': disk_usage.used,
            'disk_total': disk_usage.total
        }

    def get_network_metrics(self):
        """Return network throughput since the previous network sample.

        Returns:
            dict with ``download_speed`` / ``upload_speed`` (byte delta divided
            by 1024**2 and by the elapsed seconds).
        """
        current_time = time.time()
        net_io_counters = psutil.net_io_counters()

        # Uses the network-specific timestamp: a disk sample taken a moment ago
        # must not shrink this interval.
        time_delta = max(current_time - self.prev_net_time, 1e-6)

        download_speed = (net_io_counters.bytes_recv - self.prev_net_bytes_recv) / (1024 ** 2) / time_delta
        upload_speed = (net_io_counters.bytes_sent - self.prev_net_bytes_sent) / (1024 ** 2) / time_delta

        self.prev_net_bytes_recv = net_io_counters.bytes_recv
        self.prev_net_bytes_sent = net_io_counters.bytes_sent
        self.prev_net_time = current_time
        self.prev_time = current_time

        return {
            'download_speed': download_speed,
            'upload_speed': upload_speed
        }

    def _read_gpu(self, device):
        """Query one device and map any ``NA`` reading to -1."""
        with device.oneshot():
            # Call each getter once and reuse the value for both the NA check
            # and the reported figure.
            gpu_util = device.gpu_utilization()
            mem_used = device.memory_used()
            mem_total = device.memory_total()
            temperature = device.temperature()
            fan_speed = device.fan_speed()
        return {
            'gpu_util': gpu_util if gpu_util is not NA else -1,
            'mem_used': mem_used / (1024**3) if mem_used is not NA else -1,
            'mem_total': mem_total / (1024**3) if mem_total is not NA else -1,
            'temperature': temperature if temperature is not NA else -1,
            'fan_speed': fan_speed if fan_speed is not NA else -1,
        }

    def get_gpu_metrics(self):
        """Return the readings of the first device in ``self.devices``.

        Returns:
            dict with keys ``gpu_util``, ``mem_used``, ``mem_total``,
            ``temperature`` and ``fan_speed``. A field is -1 when the device
            reports ``NA`` for it; every field is -1 when there are no devices
            at all (previously this raised ``IndexError``).
        """
        if not self.devices:
            return {
                'gpu_util': -1,
                'mem_used': -1,
                'mem_total': -1,
                'temperature': -1,
                'fan_speed': -1,
            }
        # Only the first device's readings are returned, so only it is queried.
        return self._read_gpu(self.devices[0])


In [35]:
# ground_control/utils/system_metrics.py

from typing import List, Union
import nvitop  # Ensure nvitop is installed: pip install nvitop

def get_all_gpu_devices() -> List[Union[nvitop.Device, nvitop.MigDevice]]:
    """
    Build one flat list of GPU devices in physical-device order.

    A physical device that has MIG devices attached is replaced by those MIG
    devices; a physical device without any is listed itself.

    Returns:
        List of GPU devices (PhysicalDevice or MigDevice)
    """
    all_physical = nvitop.Device.all()
    all_migs = nvitop.MigDevice.all()

    # Group MIG devices under the index of their parent physical device.
    # Each MIG index is unpacked as a (physical_idx, mig_idx) pair.
    migs_by_parent = {}
    for mig_device in all_migs:
        parent_idx, _ = mig_device.index
        migs_by_parent.setdefault(parent_idx, []).append(mig_device)

    # Walk the physical devices and substitute MIG children where present.
    result = []
    for physical in all_physical:
        children = migs_by_parent.get(physical.index)
        if children is None:
            result.append(physical)
        else:
            result.extend(children)

    return result
# Bare last expression of the cell: the returned device list is shown as the cell output.
get_all_gpu_devices()

[MigDevice(index=(0, 0), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB),
 MigDevice(index=(0, 1), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB),
 MigDevice(index=(1, 0), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB),
 MigDevice(index=(1, 1), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB),
 PhysicalDevice(index=2, name='NVIDIA A100-PCIE-40GB', total_memory=40.00GiB),
 MigDevice(index=(3, 0), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB),
 MigDevice(index=(3, 1), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB),
 PhysicalDevice(index=4, name='NVIDIA A100-PCIE-40GB', total_memory=40.00GiB)]

In [36]:
import nvitop
# Print a short report for every entry of the combined device list:
# the device itself, its utilization rates, used/total memory ratio and availability.
for g in get_all_gpu_devices():
    rates = g.utilization_rates()
    print(g)
    print(rates)
    used_fraction = g.memory_used() / g.memory_total()
    print(used_fraction)
    available = g.is_available()
    print(available)
    print()
    # Further probes left disabled from the exploration:
    #   g.temperature(), g.fan_speed(), g.processes(),
    #   g.utilization(), g.memory_info(), g.ecc_errors()

MigDevice(index=(0, 0), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB)
UtilizationRates(gpu='N/A', memory='N/A', encoder='N/A', decoder='N/A')
0.0018717447916666667
True

MigDevice(index=(0, 1), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB)
UtilizationRates(gpu='N/A', memory='N/A', encoder='N/A', decoder='N/A')
0.1251064202724359
True

MigDevice(index=(1, 0), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB)
UtilizationRates(gpu='N/A', memory='N/A', encoder='N/A', decoder='N/A')
0.6978196364182693
True

MigDevice(index=(1, 1), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB)
UtilizationRates(gpu='N/A', memory='N/A', encoder='N/A', decoder='N/A')
0.5623059395032052
True

PhysicalDevice(index=2, name='NVIDIA A100-PCIE-40GB', total_memory=40.00GiB)
UtilizationRates(gpu=29, memory=10, encoder=0, decoder=0)
0.2954010009765625
True

MigDevice(index=(3, 0), name='NVIDIA A100-PCIE-40GB MIG 3g.20gb', total_memory=19968MiB)
Util